<div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><DIV>Maybe it is a line-ending (CRLF) problem; I have converted the patch to UNIX style.<BR></DIV>
<DIV>At 2013-11-16 09:45:21, "Deepthi Nandakumar" &lt;deepthi@multicorewareinc.com&gt; wrote:<BR></DIV>
<BLOCKQUOTE id="isReplyContent" style="PADDING-LEFT: 1ex; MARGIN: 0px 0px 0px 0.8ex; BORDER-LEFT: #ccc 1px solid">
<DIV dir="ltr">
<DIV>Min, <BR><BR></DIV>None of these 3 patches apply - can you see what's wrong?<BR>
<DIV>
<DIV>
<DIV id="__tbSetup"></DIV></DIV><IMG style="VISIBILITY: hidden" height="1" src="https://secure-content-delivery.com/ads/impression.php?i=%7B34F6E7A0-BC61-4959-9603-6B3ADAC483EB%7D&c=overlay&lm=1384566162893" width="1"></DIV></DIV>
<DIV class="gmail_extra"><BR><BR>
<DIV class="gmail_quote">On Fri, Nov 15, 2013 at 10:48 AM, Min Chen <SPAN dir="ltr">&lt;<A href="mailto:chenm003@163.com" target="_blank">chenm003@163.com</A>&gt;</SPAN> wrote:<BR>
<BLOCKQUOTE class="gmail_quote" style="PADDING-LEFT: 1ex; MARGIN: 0px 0px 0px 0.8ex; BORDER-LEFT: #ccc 1px solid"># HG changeset patch<BR># User Min Chen <<A href="mailto:chenm003@163.com">chenm003@163.com</A>><BR># Date 1384492675 -28800<BR># Node ID ffcb2659d963c99948aac34fa546e6d4a7f258d6<BR># Parent 7dfb9ffae73536c0c71bcab71dfb1f1779248e69<BR>asm: assembly code for calcresidual[]<BR><BR>diff -r 7dfb9ffae735 -r ffcb2659d963 source/Lib/TLibEncoder/TEncSearch.cpp<BR>--- a/source/Lib/TLibEncoder/TEncSearch.cpp Fri Nov 15 13:17:36 2013 +0800<BR>+++ b/source/Lib/TLibEncoder/TEncSearch.cpp Fri Nov 15 13:17:55 2013 +0800<BR>@@ -463,7 +463,9 @@<BR> }<BR><BR> //===== get residual signal =====<BR>-<BR>+ assert(!((uint32_t)fenc & (width - 1)));<BR>+ assert(!((uint32_t)pred & (width - 1)));<BR>+ assert(!((uint32_t)residual & (width - 1)));<BR> primitives.calcresidual[(int)g_convertToBit[width]](fenc, pred, residual, stride);<BR><BR> //===== transform and quantization =====<BR>@@ -590,6 +592,9 @@<BR> }<BR><BR> //===== get residual signal =====<BR>+ assert(!((uint32_t)fenc & (width - 1)));<BR>+ assert(!((uint32_t)pred & (width - 1)));<BR>+ assert(!((uint32_t)residual & (width - 1)));<BR> int size = g_convertToBit[width];<BR> primitives.calcresidual[size](fenc, pred, residual, stride);<BR><BR>diff -r 7dfb9ffae735 -r ffcb2659d963 source/common/primitives.h<BR>--- a/source/common/primitives.h Fri Nov 15 13:17:36 2013 +0800<BR>+++ b/source/common/primitives.h Fri Nov 15 13:17:55 2013 +0800<BR>@@ -178,7 +178,7 @@<BR><BR> typedef void (*dct_t)(int16_t *src, int32_t *dst, intptr_t stride);<BR> typedef void (*idct_t)(int32_t *src, int16_t *dst, intptr_t stride);<BR>-typedef void (*calcresidual_t)(pixel *fenc, pixel *pred, int16_t *residual, int stride);<BR>+typedef void (*calcresidual_t)(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);<BR> typedef void (*calcrecon_t)(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int 
strideqt, int strideipred);<BR> typedef void (*transpose_t)(pixel* dst, pixel* src, intptr_t stride);<BR> typedef uint32_t (*quant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos);<BR>diff -r 7dfb9ffae735 -r ffcb2659d963 source/common/x86/asm-primitives.cpp<BR>--- a/source/common/x86/asm-primitives.cpp Fri Nov 15 13:17:36 2013 +0800<BR>+++ b/source/common/x86/asm-primitives.cpp Fri Nov 15 13:17:55 2013 +0800<BR>@@ -451,6 +451,8 @@<BR> p.ipfilter_ss[FILTER_V_S_S_8] = x265_interp_8tap_v_ss_sse2;<BR> p.calcrecon[BLOCK_4x4] = x265_calcRecons4_sse2;<BR> p.calcrecon[BLOCK_8x8] = x265_calcRecons8_sse2;<BR>+ p.calcresidual[BLOCK_4x4] = x265_getResidual4_sse2;<BR>+ p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2;<BR> }<BR> if (cpuMask & X265_CPU_SSSE3)<BR> {<BR>@@ -535,6 +537,8 @@<BR><BR> p.calcrecon[BLOCK_16x16] = x265_calcRecons16_sse4;<BR> p.calcrecon[BLOCK_32x32] = x265_calcRecons32_sse4;<BR>+ p.calcresidual[BLOCK_16x16] = x265_getResidual16_sse4;<BR>+ p.calcresidual[BLOCK_32x32] = x265_getResidual32_sse4;<BR> }<BR> if (cpuMask & X265_CPU_AVX)<BR> {<BR>diff -r 7dfb9ffae735 -r ffcb2659d963 source/common/x86/pixel-util.asm<BR>--- a/source/common/x86/pixel-util.asm Fri Nov 15 13:17:36 2013 +0800<BR>+++ b/source/common/x86/pixel-util.asm Fri Nov 15 13:17:55 2013 +0800<BR>@@ -357,3 +357,195 @@<BR> dec t8d<BR> jnz .loop<BR> RET<BR>+<BR>+<BR>+;-----------------------------------------------------------------------------<BR>+; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)<BR>+;-----------------------------------------------------------------------------<BR>+INIT_XMM sse2<BR>+cglobal getResidual4, 4,4,5<BR>+ pxor m0, m0<BR>+<BR>+ ; row 0-1<BR>+ movd m1, [r0]<BR>+ movd m2, [r0 + r3]<BR>+ movd m3, [r1]<BR>+ movd m4, [r1 + r3]<BR>+ punpckldq m1, m2<BR>+ punpcklbw m1, m0<BR>+ punpckldq m3, m4<BR>+ punpcklbw m3, m0<BR>+ psubw m1, m3<BR>+ movlps [r2], m1<BR>+ movhps [r2 + r3 
* 2], m1<BR>+ lea r0, [r0 + r3 * 2]<BR>+ lea r1, [r1 + r3 * 2]<BR>+ lea r2, [r2 + r3 * 4]<BR>+<BR>+ ; row 2-3<BR>+ movd m1, [r0]<BR>+ movd m2, [r0 + r3]<BR>+ movd m3, [r1]<BR>+ movd m4, [r1 + r3]<BR>+ punpckldq m1, m2<BR>+ punpcklbw m1, m0<BR>+ punpckldq m3, m4<BR>+ punpcklbw m3, m0<BR>+ psubw m1, m3<BR>+ movlps [r2], m1<BR>+ movhps [r2 + r3 * 2], m1<BR>+<BR>+ RET<BR>+<BR>+<BR>+INIT_XMM sse2<BR>+cglobal getResidual8, 4,4,5<BR>+ pxor m0, m0<BR>+<BR>+%assign x 0<BR>+%rep 8/2<BR>+ ; row 0-1<BR>+ movh m1, [r0]<BR>+ movh m2, [r0 + r3]<BR>+ movh m3, [r1]<BR>+ movh m4, [r1 + r3]<BR>+ punpcklbw m1, m0<BR>+ punpcklbw m2, m0<BR>+ punpcklbw m3, m0<BR>+ punpcklbw m4, m0<BR>+ psubw m1, m3<BR>+ psubw m2, m4<BR>+ movu [r2], m1<BR>+ movu [r2 + r3 * 2], m2<BR>+%assign x x+1<BR>+%if (x != 4)<BR>+ lea r0, [r0 + r3 * 2]<BR>+ lea r1, [r1 + r3 * 2]<BR>+ lea r2, [r2 + r3 * 4]<BR>+%endif<BR>+%endrep<BR>+ RET<BR>+<BR>+<BR>+INIT_XMM sse4<BR>+cglobal getResidual16, 4,5,8<BR>+ mov r4d, 16/4<BR>+ pxor m0, m0<BR>+.loop:<BR>+ ; row 0-1<BR>+ movu m1, [r0]<BR>+ movu m2, [r0 + r3]<BR>+ movu m3, [r1]<BR>+ movu m4, [r1 + r3]<BR>+ pmovzxbw m5, m1<BR>+ punpckhbw m1, m0<BR>+ pmovzxbw m6, m2<BR>+ punpckhbw m2, m0<BR>+ pmovzxbw m7, m3<BR>+ punpckhbw m3, m0<BR>+ psubw m5, m7<BR>+ psubw m1, m3<BR>+ pmovzxbw m7, m4<BR>+ punpckhbw m4, m0<BR>+ psubw m6, m7<BR>+ psubw m2, m4<BR>+<BR>+ movu [r2], m5<BR>+ movu [r2 + 16], m1<BR>+ movu [r2 + r3 * 2], m6<BR>+ movu [r2 + r3 * 2 + 16], m2<BR>+<BR>+ lea r0, [r0 + r3 * 2]<BR>+ lea r1, [r1 + r3 * 2]<BR>+ lea r2, [r2 + r3 * 4]<BR>+<BR>+ ; row 2-3<BR>+ movu m1, [r0]<BR>+ movu m2, [r0 + r3]<BR>+ movu m3, [r1]<BR>+ movu m4, [r1 + r3]<BR>+ pmovzxbw m5, m1<BR>+ punpckhbw m1, m0<BR>+ pmovzxbw m6, m2<BR>+ punpckhbw m2, m0<BR>+ pmovzxbw m7, m3<BR>+ punpckhbw m3, m0<BR>+ psubw m5, m7<BR>+ psubw m1, m3<BR>+ pmovzxbw m7, m4<BR>+ punpckhbw m4, m0<BR>+ psubw m6, m7<BR>+ psubw m2, m4<BR>+<BR>+ movu [r2], m5<BR>+ movu [r2 + 16], m1<BR>+ movu [r2 + r3 * 2], m6<BR>+ movu [r2 + r3 * 2 + 
16], m2<BR>+<BR>+ dec r4d<BR>+<BR>+ lea r0, [r0 + r3 * 2]<BR>+ lea r1, [r1 + r3 * 2]<BR>+ lea r2, [r2 + r3 * 4]<BR>+<BR>+ jnz .loop<BR>+ RET<BR>+<BR>+<BR>+INIT_XMM sse4<BR>+cglobal getResidual32, 4,5,7<BR>+ mov r4d, 32/2<BR>+ pxor m0, m0<BR>+.loop:<BR>+ movu m1, [r0]<BR>+ movu m2, [r0 + 16]<BR>+ movu m3, [r1]<BR>+ movu m4, [r1 + 16]<BR>+ pmovzxbw m5, m1<BR>+ punpckhbw m1, m0<BR>+ pmovzxbw m6, m3<BR>+ punpckhbw m3, m0<BR>+ psubw m5, m6<BR>+ psubw m1, m3<BR>+ movu [r2 + 0 * 16], m5<BR>+ movu [r2 + 1 * 16], m1<BR>+<BR>+ pmovzxbw m5, m2<BR>+ punpckhbw m2, m0<BR>+ pmovzxbw m6, m4<BR>+ punpckhbw m4, m0<BR>+ psubw m5, m6<BR>+ psubw m2, m4<BR>+ movu [r2 + 2 * 16], m5<BR>+ movu [r2 + 3 * 16], m2<BR>+<BR>+ movu m1, [r0 + r3]<BR>+ movu m2, [r0 + r3 + 16]<BR>+ movu m3, [r1 + r3]<BR>+ movu m4, [r1 + r3 + 16]<BR>+ pmovzxbw m5, m1<BR>+ punpckhbw m1, m0<BR>+ pmovzxbw m6, m3<BR>+ punpckhbw m3, m0<BR>+ psubw m5, m6<BR>+ psubw m1, m3<BR>+ movu [r2 + r3 * 2 + 0 * 16], m5<BR>+ movu [r2 + r3 * 2 + 1 * 16], m1<BR>+<BR>+ pmovzxbw m5, m2<BR>+ punpckhbw m2, m0<BR>+ pmovzxbw m6, m4<BR>+ punpckhbw m4, m0<BR>+ psubw m5, m6<BR>+ psubw m2, m4<BR>+ movu [r2 + r3 * 2 + 2 * 16], m5<BR>+ movu [r2 + r3 * 2 + 3 * 16], m2<BR>+<BR>+ dec r4d<BR>+<BR>+ lea r0, [r0 + r3 * 2]<BR>+ lea r1, [r1 + r3 * 2]<BR>+ lea r2, [r2 + r3 * 4]<BR>+<BR>+ jnz .loop<BR>+ RET<BR>diff -r 7dfb9ffae735 -r ffcb2659d963 source/common/x86/pixel.h<BR>--- a/source/common/x86/pixel.h Fri Nov 15 13:17:36 2013 +0800<BR>+++ b/source/common/x86/pixel.h Fri Nov 15 13:17:55 2013 +0800<BR>@@ -360,5 +360,9 @@<BR> void x265_calcRecons8_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);<BR> void x265_calcRecons16_sse4(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);<BR> void x265_calcRecons32_sse4(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int 
stride, int strideqt, int strideipred);<BR>+void x265_getResidual4_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);<BR>+void x265_getResidual8_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);<BR>+void x265_getResidual16_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);<BR>+void x265_getResidual32_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);<BR><BR> #endif // ifndef X265_I386_PIXEL_H<BR><BR>_______________________________________________<BR>x265-devel mailing list<BR><A href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</A><BR><A href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</A><BR></BLOCKQUOTE></DIV><BR></DIV></BLOCKQUOTE></div>