[x265] Fwd: [PATCH 4 of 4] asm: interp_8tap_v_sp for ipfilter_sp[FILTER_V_S_P_8]
Praveen Tiwari
praveen at multicorewareinc.com
Tue Oct 29 06:15:10 CET 2013
---------- Forwarded message ----------
From: Steve Borho <steve at borho.org>
Date: Mon, Oct 28, 2013 at 11:55 PM
Subject: Re: [x265] [PATCH 4 of 4] asm: interp_8tap_v_sp for
ipfilter_sp[FILTER_V_S_P_8]
To: Development for x265 <x265-devel at videolan.org>
On Mon, Oct 28, 2013 at 9:24 AM, Min Chen <chenm003 at 163.com> wrote:
> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1382970234 -28800
> # Node ID 41425f18efe14be468715bfa68fdebbb9a49145f
> # Parent 5f7b3d06d94c6aec44bfd4a7bfb6f6751182b4ed
> asm: interp_8tap_v_sp for ipfilter_sp[FILTER_V_S_P_8]
>
>>I'm getting link errors on x86_64 from this series:
>>error LNK2017: 'ADDR32' relocation to 'tab_LumaCoeffV' invalid without
/LARGEADDRESSAWARE:NO
This error occurs because 64-bit builds do not support [register + global_constant]
addressing. I generally guard such accesses with the PIC macro, like this:
%ifdef PIC
lea r5, [tab_ChromaCoeff]
movd m0, [r5 + r4 * 4]
%else
movd m0, [tab_ChromaCoeff + r4 * 4]
%endif
>>In general, I think we should drop all of the interpolation merging while
we get all the assembly completed for motion compensation. When the
assembly is all together, we can experiment and figure out if it makes sense
to re-merge some of them back together.
> diff -r 5f7b3d06d94c -r 41425f18efe1 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Mon Oct 28 22:23:29 2013
> +0800
> +++ b/source/common/x86/asm-primitives.cpp Mon Oct 28 22:23:54 2013
> +0800
> @@ -280,6 +280,7 @@
> p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_ssse3;
>
> p.luma_hvpp[LUMA_8x8] = x265_interp_8tap_hv_pp_8x8_ssse3;
> + p.ipfilter_sp[FILTER_V_S_P_8] = x265_interp_8tap_v_sp_ssse3;
> }
> if (cpuMask & X265_CPU_SSE4)
> {
> diff -r 5f7b3d06d94c -r 41425f18efe1 source/common/x86/ipfilter8.asm
> --- a/source/common/x86/ipfilter8.asm Mon Oct 28 22:23:29 2013 +0800
> +++ b/source/common/x86/ipfilter8.asm Mon Oct 28 22:23:54 2013 +0800
> @@ -774,3 +774,114 @@
> jnz .loopV
>
> RET
> +
> +
>
> +;-----------------------------------------------------------------------------
> +; void interp_8tap_v_sp(int16_t *src, intptr_t srcStride, pixel *dst,
> intptr_t dstStride, int width, int height, const int coeffIdx);
>
> +;-----------------------------------------------------------------------------
> +INIT_XMM ssse3
> +cglobal interp_8tap_v_sp, 4, 7, 8, 0-(2*4 + 3*gprsize)
> +%define old_r0 (rsp + 2 * 4 + 0 * gprsize)
> +%define old_r2 (rsp + 2 * 4 + 1 * gprsize)
> +%define old_r3 (rsp + 2 * 4 + 2 * gprsize)
> +%define old_r4d (rsp + 0 * 4)
> +%define old_6rows (rsp + 1 * 4)
> +
> + mov r4d, r4m
> + mov r5d, r5m
> +
> + ; load coeff table
> + mov r6d, r6m
> + shl r6, 6
> + lea r6, [tab_LumaCoeffV + r6]
> +
> + mov [old_r4d], r4d
> + mov [old_r2], r2
> +
> + ; move to -3
> + lea r1, [r1 * 2]
> + lea r4, [r1 + r1 * 2]
> + sub r0, r4
> + lea r4, [r4 * 2]
> + mov [old_6rows], r4
> +
> +.loopH:
> +
> + ; load width
> + mov r4d, [old_r4d]
> +
> + ; save old src
> + mov [old_r0], r0
> +
> +.loopW:
> +
> + movu m0, [r0]
> + movu m1, [r0 + r1]
> + lea r0, [r0 + r1 * 2]
> + punpcklwd m2, m0, m1
> + pmaddwd m2, [r6 + 0 * 16]
> + punpckhwd m0, m1
> + pmaddwd m0, [r6 + 0 * 16]
> +
> + movu m3, [r0]
> + movu m4, [r0 + r1]
> + lea r0, [r0 + r1 * 2]
> + punpcklwd m1, m3, m4
> + pmaddwd m1, [r6 + 1 * 16]
> + paddd m2, m1
> + punpckhwd m3, m4
> + pmaddwd m3, [r6 + 1 * 16]
> + paddd m0, m3
> +
> + movu m3, [r0]
> + movu m4, [r0 + r1]
> + lea r0, [r0 + r1 * 2]
> + punpcklwd m1, m3, m4
> + pmaddwd m1, [r6 + 2 * 16]
> + paddd m2, m1
> + punpckhwd m3, m4
> + pmaddwd m3, [r6 + 2 * 16]
> + paddd m0, m3
> +
> + movu m3, [r0]
> + movu m4, [r0 + r1]
> + punpcklwd m1, m3, m4
> + pmaddwd m1, [r6 + 3 * 16]
> + paddd m2, m1
> + punpckhwd m3, m4
> + pmaddwd m3, [r6 + 3 * 16]
> + paddd m0, m3
> +
> + paddd m2, [tab_c_526336]
> + paddd m0, [tab_c_526336]
> + psrad m2, 12
> + psrad m0, 12
> + packssdw m2, m0
> + packuswb m2, m2
> +
> + ; move to next 8 col
> + sub r0, [old_6rows]
> +
> + sub r4, 8
> + jl .width4
> + movq [r2], m2
> + je .nextH
> + lea r0, [r0 + 16]
> + lea r2, [r2 + 8]
> + jmp .loopW
> +
> +.width4:
> + movd [r2], m2
> + lea r0, [r0 + 4]
> +
> +.nextH:
> + ; move to next row
> + mov r0, [old_r0]
> + lea r0, [r0 + r1]
> + add [old_r2], r3d
> + mov r2, [old_r2]
> +
> + dec r5d
> + jnz .loopH
> +
> + RET
> diff -r 5f7b3d06d94c -r 41425f18efe1 source/common/x86/ipfilter8.h
> --- a/source/common/x86/ipfilter8.h Mon Oct 28 22:23:29 2013 +0800
> +++ b/source/common/x86/ipfilter8.h Mon Oct 28 22:23:54 2013 +0800
> @@ -89,6 +89,7 @@
> LUMA_FILTERS(_sse4);
>
> void x265_interp_8tap_hv_pp_8x8_ssse3(pixel * src, intptr_t srcStride,
> pixel * dst, intptr_t dstStride, int idxX, int idxY);
> +void x265_interp_8tap_v_sp_ssse3(int16_t *src, intptr_t srcStride, pixel
> *dst, intptr_t dstStride, int width, int height, const int coeffIdx);
>
> #undef SETUP_CHROMA_FUNC_DEF
> #undef SETUP_LUMA_FUNC_DEF
> diff -r 5f7b3d06d94c -r 41425f18efe1 source/test/ipfilterharness.cpp
> --- a/source/test/ipfilterharness.cpp Mon Oct 28 22:23:29 2013 +0800
> +++ b/source/test/ipfilterharness.cpp Mon Oct 28 22:23:54 2013 +0800
> @@ -164,6 +164,8 @@
> int rand_width = rand() % 100; // Randomly generated
> Width
> int16_t rand_val, rand_srcStride, rand_dstStride;
>
> + rand_width &= ~3;
> +
> for (int i = 0; i <= 100; i++)
> {
> memset(IPF_vec_output_p, 0, ipf_t_size); // Initialize
> output buffer to zero
> @@ -173,16 +175,16 @@
> rand_srcStride = rand() % 100; // Randomly generated
> srcStride
> rand_dstStride = rand() % 100; // Randomly generated
> dstStride
>
> - opt(short_buff + 3 * rand_srcStride,
> + ref(short_buff + 3 * rand_srcStride,
> rand_srcStride,
> - IPF_vec_output_p,
> + IPF_C_output_p,
> rand_dstStride,
> rand_width,
> rand_height, rand_val
> );
> - ref(short_buff + 3 * rand_srcStride,
> + opt(short_buff + 3 * rand_srcStride,
> rand_srcStride,
> - IPF_C_output_p,
> + IPF_vec_output_p,
> rand_dstStride,
> rand_width,
> rand_height, rand_val
> diff -r 5f7b3d06d94c -r 41425f18efe1 source/test/testbench.cpp
> --- a/source/test/testbench.cpp Mon Oct 28 22:23:29 2013 +0800
> +++ b/source/test/testbench.cpp Mon Oct 28 22:23:54 2013 +0800
> @@ -74,7 +74,7 @@
> }
> }
>
> - int seed = (int)time(NULL);
> + int seed = 0x526E629B;//(int)time(NULL);
> const char *bpp[] = { "8bpp", "16bpp" };
> printf("Using random seed %X %s\n", seed, bpp[HIGH_BIT_DEPTH]);
> srand(seed);
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
--
Steve Borho
_______________________________________________
x265-devel mailing list
x265-devel at videolan.org
https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131029/74506088/attachment.html>
More information about the x265-devel
mailing list