<div dir="ltr"><div>Min, <br><br></div>None of these 3 patches apply - can you see what's wrong?<br><div><div><div id="__tbSetup"></div></div><img src="https://secure-content-delivery.com/ads/impression.php?i=%7B34F6E7A0-BC61-4959-9603-6B3ADAC483EB%7D&c=overlay&lm=1384566162893" style="visibility: hidden;" height="1" width="1"></div>
</div><div class="gmail_extra"><br><br><div class="gmail_quote">On Fri, Nov 15, 2013 at 10:48 AM, Min Chen <span dir="ltr">&lt;<a href="mailto:chenm003@163.com" target="_blank">chenm003@163.com</a>&gt;</span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
# HG changeset patch<br>
# User Min Chen &lt;<a href="mailto:chenm003@163.com">chenm003@163.com</a>&gt;<br>
# Date 1384492675 -28800<br>
# Node ID ffcb2659d963c99948aac34fa546e6d4a7f258d6<br>
# Parent 7dfb9ffae73536c0c71bcab71dfb1f1779248e69<br>
asm: assembly code for calcresidual[]<br>
<br>
diff -r 7dfb9ffae735 -r ffcb2659d963 source/Lib/TLibEncoder/TEncSearch.cpp<br>
--- a/source/Lib/TLibEncoder/TEncSearch.cpp Fri Nov 15 13:17:36 2013 +0800<br>
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp Fri Nov 15 13:17:55 2013 +0800<br>
@@ -463,7 +463,9 @@<br>
}<br>
<br>
//===== get residual signal =====<br>
-<br>
+ assert(!((uint32_t)fenc & (width - 1)));<br>
+ assert(!((uint32_t)pred & (width - 1)));<br>
+ assert(!((uint32_t)residual & (width - 1)));<br>
primitives.calcresidual[(int)g_convertToBit[width]](fenc, pred, residual, stride);<br>
<br>
//===== transform and quantization =====<br>
@@ -590,6 +592,9 @@<br>
}<br>
<br>
//===== get residual signal =====<br>
+ assert(!((uint32_t)fenc & (width - 1)));<br>
+ assert(!((uint32_t)pred & (width - 1)));<br>
+ assert(!((uint32_t)residual & (width - 1)));<br>
int size = g_convertToBit[width];<br>
primitives.calcresidual[size](fenc, pred, residual, stride);<br>
<br>
diff -r 7dfb9ffae735 -r ffcb2659d963 source/common/primitives.h<br>
--- a/source/common/primitives.h Fri Nov 15 13:17:36 2013 +0800<br>
+++ b/source/common/primitives.h Fri Nov 15 13:17:55 2013 +0800<br>
@@ -178,7 +178,7 @@<br>
<br>
typedef void (*dct_t)(int16_t *src, int32_t *dst, intptr_t stride);<br>
typedef void (*idct_t)(int32_t *src, int16_t *dst, intptr_t stride);<br>
-typedef void (*calcresidual_t)(pixel *fenc, pixel *pred, int16_t *residual, int stride);<br>
+typedef void (*calcresidual_t)(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);<br>
typedef void (*calcrecon_t)(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);<br>
typedef void (*transpose_t)(pixel* dst, pixel* src, intptr_t stride);<br>
typedef uint32_t (*quant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos);<br>
diff -r 7dfb9ffae735 -r ffcb2659d963 source/common/x86/asm-primitives.cpp<br>
--- a/source/common/x86/asm-primitives.cpp Fri Nov 15 13:17:36 2013 +0800<br>
+++ b/source/common/x86/asm-primitives.cpp Fri Nov 15 13:17:55 2013 +0800<br>
@@ -451,6 +451,8 @@<br>
p.ipfilter_ss[FILTER_V_S_S_8] = x265_interp_8tap_v_ss_sse2;<br>
p.calcrecon[BLOCK_4x4] = x265_calcRecons4_sse2;<br>
p.calcrecon[BLOCK_8x8] = x265_calcRecons8_sse2;<br>
+ p.calcresidual[BLOCK_4x4] = x265_getResidual4_sse2;<br>
+ p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2;<br>
}<br>
if (cpuMask & X265_CPU_SSSE3)<br>
{<br>
@@ -535,6 +537,8 @@<br>
<br>
p.calcrecon[BLOCK_16x16] = x265_calcRecons16_sse4;<br>
p.calcrecon[BLOCK_32x32] = x265_calcRecons32_sse4;<br>
+ p.calcresidual[BLOCK_16x16] = x265_getResidual16_sse4;<br>
+ p.calcresidual[BLOCK_32x32] = x265_getResidual32_sse4;<br>
}<br>
if (cpuMask & X265_CPU_AVX)<br>
{<br>
diff -r 7dfb9ffae735 -r ffcb2659d963 source/common/x86/pixel-util.asm<br>
--- a/source/common/x86/pixel-util.asm Fri Nov 15 13:17:36 2013 +0800<br>
+++ b/source/common/x86/pixel-util.asm Fri Nov 15 13:17:55 2013 +0800<br>
@@ -357,3 +357,195 @@<br>
dec t8d<br>
jnz .loop<br>
RET<br>
+<br>
+<br>
+;-----------------------------------------------------------------------------<br>
+; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)<br>
+;-----------------------------------------------------------------------------<br>
+INIT_XMM sse2<br>
+cglobal getResidual4, 4,4,5<br>
+ pxor m0, m0<br>
+<br>
+ ; row 0-1<br>
+ movd m1, [r0]<br>
+ movd m2, [r0 + r3]<br>
+ movd m3, [r1]<br>
+ movd m4, [r1 + r3]<br>
+ punpckldq m1, m2<br>
+ punpcklbw m1, m0<br>
+ punpckldq m3, m4<br>
+ punpcklbw m3, m0<br>
+ psubw m1, m3<br>
+ movlps [r2], m1<br>
+ movhps [r2 + r3 * 2], m1<br>
+ lea r0, [r0 + r3 * 2]<br>
+ lea r1, [r1 + r3 * 2]<br>
+ lea r2, [r2 + r3 * 4]<br>
+<br>
+ ; row 2-3<br>
+ movd m1, [r0]<br>
+ movd m2, [r0 + r3]<br>
+ movd m3, [r1]<br>
+ movd m4, [r1 + r3]<br>
+ punpckldq m1, m2<br>
+ punpcklbw m1, m0<br>
+ punpckldq m3, m4<br>
+ punpcklbw m3, m0<br>
+ psubw m1, m3<br>
+ movlps [r2], m1<br>
+ movhps [r2 + r3 * 2], m1<br>
+<br>
+ RET<br>
+<br>
+<br>
+INIT_XMM sse2<br>
+cglobal getResidual8, 4,4,5<br>
+ pxor m0, m0<br>
+<br>
+%assign x 0<br>
+%rep 8/2<br>
+ ; row 0-1<br>
+ movh m1, [r0]<br>
+ movh m2, [r0 + r3]<br>
+ movh m3, [r1]<br>
+ movh m4, [r1 + r3]<br>
+ punpcklbw m1, m0<br>
+ punpcklbw m2, m0<br>
+ punpcklbw m3, m0<br>
+ punpcklbw m4, m0<br>
+ psubw m1, m3<br>
+ psubw m2, m4<br>
+ movu [r2], m1<br>
+ movu [r2 + r3 * 2], m2<br>
+%assign x x+1<br>
+%if (x != 4)<br>
+ lea r0, [r0 + r3 * 2]<br>
+ lea r1, [r1 + r3 * 2]<br>
+ lea r2, [r2 + r3 * 4]<br>
+%endif<br>
+%endrep<br>
+ RET<br>
+<br>
+<br>
+INIT_XMM sse4<br>
+cglobal getResidual16, 4,5,8<br>
+ mov r4d, 16/4<br>
+ pxor m0, m0<br>
+.loop:<br>
+ ; row 0-1<br>
+ movu m1, [r0]<br>
+ movu m2, [r0 + r3]<br>
+ movu m3, [r1]<br>
+ movu m4, [r1 + r3]<br>
+ pmovzxbw m5, m1<br>
+ punpckhbw m1, m0<br>
+ pmovzxbw m6, m2<br>
+ punpckhbw m2, m0<br>
+ pmovzxbw m7, m3<br>
+ punpckhbw m3, m0<br>
+ psubw m5, m7<br>
+ psubw m1, m3<br>
+ pmovzxbw m7, m4<br>
+ punpckhbw m4, m0<br>
+ psubw m6, m7<br>
+ psubw m2, m4<br>
+<br>
+ movu [r2], m5<br>
+ movu [r2 + 16], m1<br>
+ movu [r2 + r3 * 2], m6<br>
+ movu [r2 + r3 * 2 + 16], m2<br>
+<br>
+ lea r0, [r0 + r3 * 2]<br>
+ lea r1, [r1 + r3 * 2]<br>
+ lea r2, [r2 + r3 * 4]<br>
+<br>
+ ; row 2-3<br>
+ movu m1, [r0]<br>
+ movu m2, [r0 + r3]<br>
+ movu m3, [r1]<br>
+ movu m4, [r1 + r3]<br>
+ pmovzxbw m5, m1<br>
+ punpckhbw m1, m0<br>
+ pmovzxbw m6, m2<br>
+ punpckhbw m2, m0<br>
+ pmovzxbw m7, m3<br>
+ punpckhbw m3, m0<br>
+ psubw m5, m7<br>
+ psubw m1, m3<br>
+ pmovzxbw m7, m4<br>
+ punpckhbw m4, m0<br>
+ psubw m6, m7<br>
+ psubw m2, m4<br>
+<br>
+ movu [r2], m5<br>
+ movu [r2 + 16], m1<br>
+ movu [r2 + r3 * 2], m6<br>
+ movu [r2 + r3 * 2 + 16], m2<br>
+<br>
+ dec r4d<br>
+<br>
+ lea r0, [r0 + r3 * 2]<br>
+ lea r1, [r1 + r3 * 2]<br>
+ lea r2, [r2 + r3 * 4]<br>
+<br>
+ jnz .loop<br>
+ RET<br>
+<br>
+<br>
+INIT_XMM sse4<br>
+cglobal getResidual32, 4,5,7<br>
+ mov r4d, 32/2<br>
+ pxor m0, m0<br>
+.loop:<br>
+ movu m1, [r0]<br>
+ movu m2, [r0 + 16]<br>
+ movu m3, [r1]<br>
+ movu m4, [r1 + 16]<br>
+ pmovzxbw m5, m1<br>
+ punpckhbw m1, m0<br>
+ pmovzxbw m6, m3<br>
+ punpckhbw m3, m0<br>
+ psubw m5, m6<br>
+ psubw m1, m3<br>
+ movu [r2 + 0 * 16], m5<br>
+ movu [r2 + 1 * 16], m1<br>
+<br>
+ pmovzxbw m5, m2<br>
+ punpckhbw m2, m0<br>
+ pmovzxbw m6, m4<br>
+ punpckhbw m4, m0<br>
+ psubw m5, m6<br>
+ psubw m2, m4<br>
+ movu [r2 + 2 * 16], m5<br>
+ movu [r2 + 3 * 16], m2<br>
+<br>
+ movu m1, [r0 + r3]<br>
+ movu m2, [r0 + r3 + 16]<br>
+ movu m3, [r1 + r3]<br>
+ movu m4, [r1 + r3 + 16]<br>
+ pmovzxbw m5, m1<br>
+ punpckhbw m1, m0<br>
+ pmovzxbw m6, m3<br>
+ punpckhbw m3, m0<br>
+ psubw m5, m6<br>
+ psubw m1, m3<br>
+ movu [r2 + r3 * 2 + 0 * 16], m5<br>
+ movu [r2 + r3 * 2 + 1 * 16], m1<br>
+<br>
+ pmovzxbw m5, m2<br>
+ punpckhbw m2, m0<br>
+ pmovzxbw m6, m4<br>
+ punpckhbw m4, m0<br>
+ psubw m5, m6<br>
+ psubw m2, m4<br>
+ movu [r2 + r3 * 2 + 2 * 16], m5<br>
+ movu [r2 + r3 * 2 + 3 * 16], m2<br>
+<br>
+ dec r4d<br>
+<br>
+ lea r0, [r0 + r3 * 2]<br>
+ lea r1, [r1 + r3 * 2]<br>
+ lea r2, [r2 + r3 * 4]<br>
+<br>
+ jnz .loop<br>
+ RET<br>
diff -r 7dfb9ffae735 -r ffcb2659d963 source/common/x86/pixel.h<br>
--- a/source/common/x86/pixel.h Fri Nov 15 13:17:36 2013 +0800<br>
+++ b/source/common/x86/pixel.h Fri Nov 15 13:17:55 2013 +0800<br>
@@ -360,5 +360,9 @@<br>
void x265_calcRecons8_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);<br>
void x265_calcRecons16_sse4(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);<br>
void x265_calcRecons32_sse4(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);<br>
+void x265_getResidual4_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);<br>
+void x265_getResidual8_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);<br>
+void x265_getResidual16_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);<br>
+void x265_getResidual32_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);<br>
<br>
#endif // ifndef X265_I386_PIXEL_H<br>
<br>
_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
</blockquote></div><br></div>