<div dir="ltr"><br><br><div class="gmail_quote">---------- Forwarded message ----------<br>From: <b class="gmail_sendername">Steve Borho</b> <span dir="ltr">&lt;<a href="mailto:steve@borho.org">steve@borho.org</a>&gt;</span><br>
Date: Mon, Oct 28, 2013 at 11:55 PM<br>Subject: Re: [x265] [PATCH 4 of 4] asm: interp_8tap_v_sp for ipfilter_sp[FILTER_V_S_P_8]<br>To: Development for x265 &lt;<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a>&gt;<br>
<br><br><div dir="ltr"><br><div class="gmail_extra"><br><br><div class="gmail_quote"><div class="im">On Mon, Oct 28, 2013 at 9:24 AM, Min Chen <span dir="ltr">&lt;<a href="mailto:chenm003@163.com" target="_blank">chenm003@163.com</a>&gt;</span> wrote:<br>
<blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex"># HG changeset patch<br>
# User Min Chen &lt;<a href="mailto:chenm003@163.com" target="_blank">chenm003@163.com</a>&gt;<br>
# Date 1382970234 -28800<br>
# Node ID 41425f18efe14be468715bfa68fdebbb9a49145f<br>
# Parent 5f7b3d06d94c6aec44bfd4a7bfb6f6751182b4ed<br>
asm: interp_8tap_v_sp for ipfilter_sp[FILTER_V_S_P_8]<br></blockquote><div><br></div><div><br></div></div><div>&gt;&gt;I'm getting link errors on x86_64 from this series:</div><div><br></div><div>&gt;&gt;error LNK2017: 'ADDR32' relocation to 'tab_LumaCoeffV' invalid without /LARGEADDRESSAWARE:NO<br>
</div><div><br></div><div>This error occurs because 64-bit builds do not support [register + global_constant] addressing. I generally guard such accesses with the PIC macro, like this:</div><div><br></div><div><div>%ifdef PIC</div><div>lea r5, [tab_ChromaCoeff]</div>
<div>movd m0, [r5 + r4 * 4]</div><div>%else</div><div>movd m0, [tab_ChromaCoeff + r4 * 4]</div><div>%endif</div></div><div><br></div><div>&gt;&gt;In general, I think we should drop all of the interpolation merging while we get all the assembly completed for motion compensation. When the assembly is all together, we can experiment and figure out if it makes sense to re-merge some of them back together.</div>
<div><div class="h5">
<div> </div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex">
diff -r 5f7b3d06d94c -r 41425f18efe1 source/common/x86/asm-primitives.cpp<br>
--- a/source/common/x86/asm-primitives.cpp Mon Oct 28 22:23:29 2013 +0800<br>
+++ b/source/common/x86/asm-primitives.cpp Mon Oct 28 22:23:54 2013 +0800<br>
@@ -280,6 +280,7 @@<br>
p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_ssse3;<br>
<br>
p.luma_hvpp[LUMA_8x8] = x265_interp_8tap_hv_pp_8x8_ssse3;<br>
+ p.ipfilter_sp[FILTER_V_S_P_8] = x265_interp_8tap_v_sp_ssse3;<br>
}<br>
if (cpuMask &amp; X265_CPU_SSE4)<br>
{<br>
diff -r 5f7b3d06d94c -r 41425f18efe1 source/common/x86/ipfilter8.asm<br>
--- a/source/common/x86/ipfilter8.asm Mon Oct 28 22:23:29 2013 +0800<br>
+++ b/source/common/x86/ipfilter8.asm Mon Oct 28 22:23:54 2013 +0800<br>
@@ -774,3 +774,114 @@<br>
jnz .loopV<br>
<br>
RET<br>
+<br>
+<br>
+;-----------------------------------------------------------------------------<br>
+; void interp_8tap_v_sp(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, const int coeffIdx);<br>
+;-----------------------------------------------------------------------------<br>
+INIT_XMM ssse3<br>
+cglobal interp_8tap_v_sp, 4, 7, 8, 0-(2*4 + 3*gprsize)<br>
+%define old_r0 (rsp + 2 * 4 + 0 * gprsize)<br>
+%define old_r2 (rsp + 2 * 4 + 1 * gprsize)<br>
+%define old_r3 (rsp + 2 * 4 + 2 * gprsize)<br>
+%define old_r4d (rsp + 0 * 4)<br>
+%define old_6rows (rsp + 1 * 4)<br>
+<br>
+ mov r4d, r4m<br>
+ mov r5d, r5m<br>
+<br>
+ ; load coeff table<br>
+ mov r6d, r6m<br>
+ shl r6, 6<br>
+ lea r6, [tab_LumaCoeffV + r6]<br>
+<br>
+ mov [old_r4d], r4d<br>
+ mov [old_r2], r2<br>
+<br>
+ ; move to -3<br>
+ lea r1, [r1 * 2]<br>
+ lea r4, [r1 + r1 * 2]<br>
+ sub r0, r4<br>
+ lea r4, [r4 * 2]<br>
+ mov [old_6rows], r4<br>
+<br>
+.loopH:<br>
+<br>
+ ; load width<br>
+ mov r4d, [old_r4d]<br>
+<br>
+ ; save old src<br>
+ mov [old_r0], r0<br>
+<br>
+.loopW:<br>
+<br>
+ movu m0, [r0]<br>
+ movu m1, [r0 + r1]<br>
+ lea r0, [r0 + r1 * 2]<br>
+ punpcklwd m2, m0, m1<br>
+ pmaddwd m2, [r6 + 0 * 16]<br>
+ punpckhwd m0, m1<br>
+ pmaddwd m0, [r6 + 0 * 16]<br>
+<br>
+ movu m3, [r0]<br>
+ movu m4, [r0 + r1]<br>
+ lea r0, [r0 + r1 * 2]<br>
+ punpcklwd m1, m3, m4<br>
+ pmaddwd m1, [r6 + 1 * 16]<br>
+ paddd m2, m1<br>
+ punpckhwd m3, m4<br>
+ pmaddwd m3, [r6 + 1 * 16]<br>
+ paddd m0, m3<br>
+<br>
+ movu m3, [r0]<br>
+ movu m4, [r0 + r1]<br>
+ lea r0, [r0 + r1 * 2]<br>
+ punpcklwd m1, m3, m4<br>
+ pmaddwd m1, [r6 + 2 * 16]<br>
+ paddd m2, m1<br>
+ punpckhwd m3, m4<br>
+ pmaddwd m3, [r6 + 2 * 16]<br>
+ paddd m0, m3<br>
+<br>
+ movu m3, [r0]<br>
+ movu m4, [r0 + r1]<br>
+ punpcklwd m1, m3, m4<br>
+ pmaddwd m1, [r6 + 3 * 16]<br>
+ paddd m2, m1<br>
+ punpckhwd m3, m4<br>
+ pmaddwd m3, [r6 + 3 * 16]<br>
+ paddd m0, m3<br>
+<br>
+ paddd m2, [tab_c_526336]<br>
+ paddd m0, [tab_c_526336]<br>
+ psrad m2, 12<br>
+ psrad m0, 12<br>
+ packssdw m2, m0<br>
+ packuswb m2, m2<br>
+<br>
+ ; move to next 8 col<br>
+ sub r0, [old_6rows]<br>
+<br>
+ sub r4, 8<br>
+ jl .width4<br>
+ movq [r2], m2<br>
+ je .nextH<br>
+ lea r0, [r0 + 16]<br>
+ lea r2, [r2 + 8]<br>
+ jmp .loopW<br>
+<br>
+.width4:<br>
+ movd [r2], m2<br>
+ lea r0, [r0 + 4]<br>
+<br>
+.nextH:<br>
+ ; move to next row<br>
+ mov r0, [old_r0]<br>
+ lea r0, [r0 + r1]<br>
+ add [old_r2], r3d<br>
+ mov r2, [old_r2]<br>
+<br>
+ dec r5d<br>
+ jnz .loopH<br>
+<br>
+ RET<br>
diff -r 5f7b3d06d94c -r 41425f18efe1 source/common/x86/ipfilter8.h<br>
--- a/source/common/x86/ipfilter8.h Mon Oct 28 22:23:29 2013 +0800<br>
+++ b/source/common/x86/ipfilter8.h Mon Oct 28 22:23:54 2013 +0800<br>
@@ -89,6 +89,7 @@<br>
LUMA_FILTERS(_sse4);<br>
<br>
void x265_interp_8tap_hv_pp_8x8_ssse3(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int idxX, int idxY);<br>
+void x265_interp_8tap_v_sp_ssse3(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, const int coeffIdx);<br>
<br>
#undef SETUP_CHROMA_FUNC_DEF<br>
#undef SETUP_LUMA_FUNC_DEF<br>
diff -r 5f7b3d06d94c -r 41425f18efe1 source/test/ipfilterharness.cpp<br>
--- a/source/test/ipfilterharness.cpp Mon Oct 28 22:23:29 2013 +0800<br>
+++ b/source/test/ipfilterharness.cpp Mon Oct 28 22:23:54 2013 +0800<br>
@@ -164,6 +164,8 @@<br>
int rand_width = rand() % 100; // Randomly generated Width<br>
int16_t rand_val, rand_srcStride, rand_dstStride;<br>
<br>
+ rand_width &amp;= ~3;<br>
+<br>
for (int i = 0; i &lt;= 100; i++)<br>
{<br>
memset(IPF_vec_output_p, 0, ipf_t_size); // Initialize output buffer to zero<br>
@@ -173,16 +175,16 @@<br>
rand_srcStride = rand() % 100; // Randomly generated srcStride<br>
rand_dstStride = rand() % 100; // Randomly generated dstStride<br>
<br>
- opt(short_buff + 3 * rand_srcStride,<br>
+ ref(short_buff + 3 * rand_srcStride,<br>
rand_srcStride,<br>
- IPF_vec_output_p,<br>
+ IPF_C_output_p,<br>
rand_dstStride,<br>
rand_width,<br>
rand_height, rand_val<br>
);<br>
- ref(short_buff + 3 * rand_srcStride,<br>
+ opt(short_buff + 3 * rand_srcStride,<br>
rand_srcStride,<br>
- IPF_C_output_p,<br>
+ IPF_vec_output_p,<br>
rand_dstStride,<br>
rand_width,<br>
rand_height, rand_val<br>
diff -r 5f7b3d06d94c -r 41425f18efe1 source/test/testbench.cpp<br>
--- a/source/test/testbench.cpp Mon Oct 28 22:23:29 2013 +0800<br>
+++ b/source/test/testbench.cpp Mon Oct 28 22:23:54 2013 +0800<br>
@@ -74,7 +74,7 @@<br>
}<br>
}<br>
<br>
- int seed = (int)time(NULL);<br>
+ int seed = 0x526E629B;//(int)time(NULL);<br>
const char *bpp[] = { "8bpp", "16bpp" };<br>
printf("Using random seed %X %s\n", seed, bpp[HIGH_BIT_DEPTH]);<br>
srand(seed);<br>
<br>
_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org" target="_blank">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
</blockquote></div></div></div><span class=""><font color="#888888"><br><br clear="all"><div><br></div>-- <br>Steve Borho
</font></span></div></div>
<br>_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
<br></div><br></div>