[x265] [PATCH 3 of 3] asm: assembly code for calcresidual[]
chen
chenm003 at 163.com
Sat Nov 16 03:08:27 CET 2013
Maybe a line-ending (CRLF) problem; I have converted the patch to UNIX style.
At 2013-11-16 09:45:21, "Deepthi Nandakumar" <deepthi at multicorewareinc.com> wrote:
Min,
None of these 3 patches apply - can you see what's wrong?
On Fri, Nov 15, 2013 at 10:48 AM, Min Chen <chenm003 at 163.com> wrote:
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1384492675 -28800
# Node ID ffcb2659d963c99948aac34fa546e6d4a7f258d6
# Parent 7dfb9ffae73536c0c71bcab71dfb1f1779248e69
asm: assembly code for calcresidual[]
diff -r 7dfb9ffae735 -r ffcb2659d963 source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp Fri Nov 15 13:17:36 2013 +0800
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp Fri Nov 15 13:17:55 2013 +0800
@@ -463,7 +463,9 @@
}
//===== get residual signal =====
-
+ assert(!((uint32_t)fenc & (width - 1)));
+ assert(!((uint32_t)pred & (width - 1)));
+ assert(!((uint32_t)residual & (width - 1)));
primitives.calcresidual[(int)g_convertToBit[width]](fenc, pred, residual, stride);
//===== transform and quantization =====
@@ -590,6 +592,9 @@
}
//===== get residual signal =====
+ assert(!((uint32_t)fenc & (width - 1)));
+ assert(!((uint32_t)pred & (width - 1)));
+ assert(!((uint32_t)residual & (width - 1)));
int size = g_convertToBit[width];
primitives.calcresidual[size](fenc, pred, residual, stride);
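[Aside: the C reference these kernels must match is essentially the loop below (a sketch; the actual reference is the blockSize-templated getResidual in source/common/pixel.cpp). Note also that the (uint32_t) pointer casts in the asserts will draw -Wpointer-to-int-cast on LP64 targets; casting through size_t would be 64-bit clean.

    /* reference behavior, sketch only */
    static void getResidual_ref(const pixel *fenc, const pixel *pred,
                                int16_t *residual, intptr_t stride, int blockSize)
    {
        for (int y = 0; y < blockSize; y++)
        {
            for (int x = 0; x < blockSize; x++)
                residual[x] = (int16_t)(fenc[x] - pred[x]);

            fenc += stride;
            pred += stride;
            residual += stride;
        }
    }
]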
diff -r 7dfb9ffae735 -r ffcb2659d963 source/common/primitives.h
--- a/source/common/primitives.h Fri Nov 15 13:17:36 2013 +0800
+++ b/source/common/primitives.h Fri Nov 15 13:17:55 2013 +0800
@@ -178,7 +178,7 @@
typedef void (*dct_t)(int16_t *src, int32_t *dst, intptr_t stride);
typedef void (*idct_t)(int32_t *src, int16_t *dst, intptr_t stride);
-typedef void (*calcresidual_t)(pixel *fenc, pixel *pred, int16_t *residual, int stride);
+typedef void (*calcresidual_t)(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
typedef void (*calcrecon_t)(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
typedef void (*transpose_t)(pixel* dst, pixel* src, intptr_t stride);
typedef uint32_t (*quant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos);
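[Aside: widening stride from int to intptr_t is what lets the asm below use the argument register directly in addressing, e.g. [r0 + r3]. With a 32-bit int argument on x86-64, each kernel would first need an explicit sign extension, something like (hypothetical):

    movsxd  r3, r3d    ; sign-extend 32-bit stride to 64 bits before addressing
]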
diff -r 7dfb9ffae735 -r ffcb2659d963 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Nov 15 13:17:36 2013 +0800
+++ b/source/common/x86/asm-primitives.cpp Fri Nov 15 13:17:55 2013 +0800
@@ -451,6 +451,8 @@
p.ipfilter_ss[FILTER_V_S_S_8] = x265_interp_8tap_v_ss_sse2;
p.calcrecon[BLOCK_4x4] = x265_calcRecons4_sse2;
p.calcrecon[BLOCK_8x8] = x265_calcRecons8_sse2;
+ p.calcresidual[BLOCK_4x4] = x265_getResidual4_sse2;
+ p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2;
}
if (cpuMask & X265_CPU_SSSE3)
{
@@ -535,6 +537,8 @@
p.calcrecon[BLOCK_16x16] = x265_calcRecons16_sse4;
p.calcrecon[BLOCK_32x32] = x265_calcRecons32_sse4;
+ p.calcresidual[BLOCK_16x16] = x265_getResidual16_sse4;
+ p.calcresidual[BLOCK_32x32] = x265_getResidual32_sse4;
}
if (cpuMask & X265_CPU_AVX)
{
diff -r 7dfb9ffae735 -r ffcb2659d963 source/common/x86/pixel-util.asm
--- a/source/common/x86/pixel-util.asm Fri Nov 15 13:17:36 2013 +0800
+++ b/source/common/x86/pixel-util.asm Fri Nov 15 13:17:55 2013 +0800
@@ -357,3 +357,195 @@
dec t8d
jnz .loop
RET
+
+
+;-----------------------------------------------------------------------------
+; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
+;-----------------------------------------------------------------------------
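+; Note: these SSE2 kernels widen u8 to i16 by unpacking each row against the
+; zero register m0 (punpcklbw m, m0), then subtract with psubw; in intrinsics
+; terms, _mm_unpacklo_epi8(v, zero) followed by _mm_sub_epi16.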
+INIT_XMM sse2
+cglobal getResidual4, 4,4,5
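+    ; r0 = fenc, r1 = pred, r2 = residual (int16_t), r3 = stride in pixels;
+    ; residual rows are stride*2 bytes apart, hence the r3*2 / r3*4 scaling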
+ pxor m0, m0
+
+ ; row 0-1
+ movd m1, [r0]
+ movd m2, [r0 + r3]
+ movd m3, [r1]
+ movd m4, [r1 + r3]
+ punpckldq m1, m2
+ punpcklbw m1, m0
+ punpckldq m3, m4
+ punpcklbw m3, m0
+ psubw m1, m3
+ movlps [r2], m1
+ movhps [r2 + r3 * 2], m1
+ lea r0, [r0 + r3 * 2]
+ lea r1, [r1 + r3 * 2]
+ lea r2, [r2 + r3 * 4]
+
+ ; row 2-3
+ movd m1, [r0]
+ movd m2, [r0 + r3]
+ movd m3, [r1]
+ movd m4, [r1 + r3]
+ punpckldq m1, m2
+ punpcklbw m1, m0
+ punpckldq m3, m4
+ punpcklbw m3, m0
+ psubw m1, m3
+ movlps [r2], m1
+ movhps [r2 + r3 * 2], m1
+
+ RET
+
+
+INIT_XMM sse2
+cglobal getResidual8, 4,4,5
+ pxor m0, m0
+
+%assign x 0
+%rep 8/2
+    ; two rows per iteration (8 rows total, unrolled x4)
+ movh m1, [r0]
+ movh m2, [r0 + r3]
+ movh m3, [r1]
+ movh m4, [r1 + r3]
+ punpcklbw m1, m0
+ punpcklbw m2, m0
+ punpcklbw m3, m0
+ punpcklbw m4, m0
+ psubw m1, m3
+ psubw m2, m4
+ movu [r2], m1
+ movu [r2 + r3 * 2], m2
+%assign x x+1
+%if (x != 4)
+ lea r0, [r0 + r3 * 2]
+ lea r1, [r1 + r3 * 2]
+ lea r2, [r2 + r3 * 4]
+%endif
+%endrep
+ RET
+
+
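+; The SSE4 kernels widen a 16-byte load in two halves: pmovzxbw zero-extends
+; the low 8 bytes into a second register (_mm_cvtepu8_epi16) while punpckhbw
+; against m0 widens the high 8 in place, avoiding the extra movdqa copy the
+; pure-SSE2 unpack sequence would need.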
+INIT_XMM sse4
+cglobal getResidual16, 4,5,8
+ mov r4d, 16/4
+ pxor m0, m0
+.loop:
+ ; row 0-1
+ movu m1, [r0]
+ movu m2, [r0 + r3]
+ movu m3, [r1]
+ movu m4, [r1 + r3]
+ pmovzxbw m5, m1
+ punpckhbw m1, m0
+ pmovzxbw m6, m2
+ punpckhbw m2, m0
+ pmovzxbw m7, m3
+ punpckhbw m3, m0
+ psubw m5, m7
+ psubw m1, m3
+ pmovzxbw m7, m4
+ punpckhbw m4, m0
+ psubw m6, m7
+ psubw m2, m4
+
+ movu [r2], m5
+ movu [r2 + 16], m1
+ movu [r2 + r3 * 2], m6
+ movu [r2 + r3 * 2 + 16], m2
+
+ lea r0, [r0 + r3 * 2]
+ lea r1, [r1 + r3 * 2]
+ lea r2, [r2 + r3 * 4]
+
+ ; row 2-3
+ movu m1, [r0]
+ movu m2, [r0 + r3]
+ movu m3, [r1]
+ movu m4, [r1 + r3]
+ pmovzxbw m5, m1
+ punpckhbw m1, m0
+ pmovzxbw m6, m2
+ punpckhbw m2, m0
+ pmovzxbw m7, m3
+ punpckhbw m3, m0
+ psubw m5, m7
+ psubw m1, m3
+ pmovzxbw m7, m4
+ punpckhbw m4, m0
+ psubw m6, m7
+ psubw m2, m4
+
+ movu [r2], m5
+ movu [r2 + 16], m1
+ movu [r2 + r3 * 2], m6
+ movu [r2 + r3 * 2 + 16], m2
+
+ dec r4d
+
+ lea r0, [r0 + r3 * 2]
+ lea r1, [r1 + r3 * 2]
+ lea r2, [r2 + r3 * 4]
+
+ jnz .loop
+ RET
+
+
+INIT_XMM sse4
+cglobal getResidual32, 4,5,7
+ mov r4d, 32/2
+ pxor m0, m0
+.loop:
+ movu m1, [r0]
+ movu m2, [r0 + 16]
+ movu m3, [r1]
+ movu m4, [r1 + 16]
+ pmovzxbw m5, m1
+ punpckhbw m1, m0
+ pmovzxbw m6, m3
+ punpckhbw m3, m0
+ psubw m5, m6
+ psubw m1, m3
+ movu [r2 + 0 * 16], m5
+ movu [r2 + 1 * 16], m1
+
+ pmovzxbw m5, m2
+ punpckhbw m2, m0
+ pmovzxbw m6, m4
+ punpckhbw m4, m0
+ psubw m5, m6
+ psubw m2, m4
+ movu [r2 + 2 * 16], m5
+ movu [r2 + 3 * 16], m2
+
+ movu m1, [r0 + r3]
+ movu m2, [r0 + r3 + 16]
+ movu m3, [r1 + r3]
+ movu m4, [r1 + r3 + 16]
+ pmovzxbw m5, m1
+ punpckhbw m1, m0
+ pmovzxbw m6, m3
+ punpckhbw m3, m0
+ psubw m5, m6
+ psubw m1, m3
+ movu [r2 + r3 * 2 + 0 * 16], m5
+ movu [r2 + r3 * 2 + 1 * 16], m1
+
+ pmovzxbw m5, m2
+ punpckhbw m2, m0
+ pmovzxbw m6, m4
+ punpckhbw m4, m0
+ psubw m5, m6
+ psubw m2, m4
+ movu [r2 + r3 * 2 + 2 * 16], m5
+ movu [r2 + r3 * 2 + 3 * 16], m2
+
+ dec r4d
+
+ lea r0, [r0 + r3 * 2]
+ lea r1, [r1 + r3 * 2]
+ lea r2, [r2 + r3 * 4]
+
+ jnz .loop
+ RET
diff -r 7dfb9ffae735 -r ffcb2659d963 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Fri Nov 15 13:17:36 2013 +0800
+++ b/source/common/x86/pixel.h Fri Nov 15 13:17:55 2013 +0800
@@ -360,5 +360,9 @@
void x265_calcRecons8_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
void x265_calcRecons16_sse4(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
void x265_calcRecons32_sse4(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
+void x265_getResidual4_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
+void x265_getResidual8_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
+void x265_getResidual16_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
+void x265_getResidual32_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
#endif // ifndef X265_I386_PIXEL_H
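[Aside: a minimal way to sanity-check one of these kernels against scalar C. Illustrative only; the harness is hypothetical (buffer sizes, alignment, and linking against the assembled pixel-util object are all assumed), and rand() stands in for real pixel data:

    #include <emmintrin.h>   /* SSE2 intrinsics, for the idiom sketch */
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    typedef uint8_t pixel;

    /* intrinsics form of the SSE2 idiom: one 8-pixel row, widen then subtract */
    static inline void residual_row8(const pixel *fenc, const pixel *pred,
                                     int16_t *residual)
    {
        __m128i zero = _mm_setzero_si128();
        __m128i f = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)fenc), zero);
        __m128i p = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)pred), zero);
        _mm_storeu_si128((__m128i *)residual, _mm_sub_epi16(f, p));
    }

    /* assembled kernel under test, declared in pixel.h above */
    void x265_getResidual8_sse2(pixel *fenc, pixel *pred, int16_t *residual,
                                intptr_t stride);

    int main(void)
    {
        _Alignas(16) pixel fenc[64], pred[64];
        _Alignas(16) int16_t ref[64], opt[64];

        for (int i = 0; i < 64; i++)
        {
            fenc[i] = (pixel)(rand() & 255);
            pred[i] = (pixel)(rand() & 255);
            ref[i] = (int16_t)(fenc[i] - pred[i]);   /* scalar reference */
        }

        x265_getResidual8_sse2(fenc, pred, opt, 8);  /* 8x8 block, stride 8 */

        for (int i = 0; i < 64; i++)
            if (ref[i] != opt[i])
            {
                printf("mismatch at %d: %d != %d\n", i, ref[i], opt[i]);
                return 1;
            }
        puts("OK");
        return 0;
    }
]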