[x265] [PATCH Review only] asm: code for pixel_var_8xN
Steve Borho
steve at borho.org
Mon Nov 25 21:07:00 CET 2013
On Nov 25, 2013, at 7:38 AM, murugan at multicorewareinc.com wrote:
> # HG changeset patch
> # User Murugan Vairavel <murugan at multicorewareinc.com>
> # Date 1385386658 -19800
> # Mon Nov 25 19:07:38 2013 +0530
> # Node ID deb2fc2dcaf24a86132ebfe0fbaac4859611c92f
> # Parent 43da6ca15a61e18d033931ca58940d6794f6f8f8
> asm: code for pixel_var_8xN
I'm not sure the encoder uses any variance block measurements other than 8x8.
>
> diff -r 43da6ca15a61 -r deb2fc2dcaf2 source/common/pixel.cpp
> --- a/source/common/pixel.cpp Mon Nov 25 18:46:28 2013 +0530
> +++ b/source/common/pixel.cpp Mon Nov 25 19:07:38 2013 +0530
> @@ -968,8 +968,11 @@
> p.ssim_4x4x2_core = ssim_4x4x2_core;
> p.ssim_end_4 = ssim_end_4;
>
> - p.var[LUMA_16x16] = pixel_var<16, 16>;
> + p.var[LUMA_8x4] = pixel_var<8, 4>;
> p.var[LUMA_8x8] = pixel_var<8, 8>;
> + p.var[LUMA_8x16] = pixel_var<8, 16>;
> + p.var[LUMA_8x32] = pixel_var<8, 32>;
> +
> p.plane_copy_deinterleave_c = plane_copy_deinterleave_chroma;
> }
> }
> diff -r 43da6ca15a61 -r deb2fc2dcaf2 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Mon Nov 25 18:46:28 2013 +0530
> +++ b/source/common/x86/asm-primitives.cpp Mon Nov 25 19:07:38 2013 +0530
> @@ -412,6 +412,15 @@
> SETUP_LUMA_BLOCKCOPY_FUNC_DEF(64, 16, cpu); \
> SETUP_LUMA_BLOCKCOPY_FUNC_DEF(16, 64, cpu);
>
> +#define SETUP_PIXEL_VAR_DEF(W, H, cpu) \
> + p.var[LUMA_ ## W ## x ## H] = x265_pixel_var_ ## W ## x ## H ## cpu;
> +
> +#define LUMA_VAR(cpu) \
> + SETUP_PIXEL_VAR_DEF(8, 4, cpu); \
> + SETUP_PIXEL_VAR_DEF(8, 8, cpu); \
> + SETUP_PIXEL_VAR_DEF(8, 16, cpu); \
> + SETUP_PIXEL_VAR_DEF(8, 32, cpu);
> +
> namespace x265 {
> // private x265 namespace
>
> @@ -442,6 +451,8 @@
> PIXEL_AVG(sse2);
> PIXEL_AVG_W4(mmx2);
>
> + LUMA_VAR(_sse2);
> +
> p.sad[LUMA_8x32] = x265_pixel_sad_8x32_sse2;
> p.sad[LUMA_16x4] = x265_pixel_sad_16x4_sse2;
> p.sad[LUMA_16x12] = x265_pixel_sad_16x12_sse2;
> diff -r 43da6ca15a61 -r deb2fc2dcaf2 source/common/x86/pixel-a.asm
> --- a/source/common/x86/pixel-a.asm Mon Nov 25 18:46:28 2013 +0530
> +++ b/source/common/x86/pixel-a.asm Mon Nov 25 19:07:38 2013 +0530
> @@ -1301,6 +1301,106 @@
>
> %if HIGH_BIT_DEPTH == 0
> %macro VAR 0
> +cglobal pixel_var_8x4, 2,3,8
> + VAR_START 1
> + lea r2, [r1 * 3]
> + movh m0, [r0]
> + movh m3, [r0 + r1]
> + movhps m0, [r0 + r1 * 2]
> + movhps m3, [r0 + r2]
> + DEINTB 1, 0, 4, 3, 7
> + lea r0, [r0 + r1 * 4]
> + VAR_CORE
> + VAR_END 8, 4
> +
> +cglobal pixel_var_8x8, 2,3,8
> + VAR_START 1
> + lea r2, [r1 * 3]
> + movh m0, [r0]
> + movh m3, [r0 + r1]
> + movhps m0, [r0 + r1 * 2]
> + movhps m3, [r0 + r2]
> + DEINTB 1, 0, 4, 3, 7
> + lea r0, [r0 + r1 * 4]
> + VAR_CORE
> + movh m0, [r0]
> + movh m3, [r0 + r1]
> + movhps m0, [r0 + r1 * 2]
> + movhps m3, [r0 + r2]
> + DEINTB 1, 0, 4, 3, 7
> + VAR_CORE
> + VAR_END 8, 8
> +
> +
> +cglobal pixel_var_8x16, 2,4,8
> + VAR_START 1
> + lea r2, [r1 * 3]
> + movh m0, [r0]
> + movh m3, [r0 + r1]
> + movhps m0, [r0 + r1 * 2]
> + movhps m3, [r0 + r2]
> + DEINTB 1, 0, 4, 3, 7
> + lea r0, [r0 + r1 * 4]
> + VAR_CORE
> + movh m0, [r0]
> + movh m3, [r0 + r1]
> + movhps m0, [r0 + r1 * 2]
> + movhps m3, [r0 + r2]
> + DEINTB 1, 0, 4, 3, 7
> + lea r0, [r0 + r1 * 4]
> + VAR_CORE
> + movh m0, [r0]
> + movh m3, [r0 + r1]
> + movhps m0, [r0 + r1 * 2]
> + movhps m3, [r0 + r2]
> + DEINTB 1, 0, 4, 3, 7
> + lea r0, [r0 + r1 * 4]
> + VAR_CORE
> + movh m0, [r0]
> + movh m3, [r0 + r1]
> + movhps m0, [r0 + r1 * 2]
> + movhps m3, [r0 + r2]
> + DEINTB 1, 0, 4, 3, 7
> + VAR_CORE
> + VAR_END 8, 16
> +
> +cglobal pixel_var_8x32, 2,4,8
> + VAR_START 1
> + mov r2d, 2
> + lea r3, [r1 * 3]
> +.loop:
> + movh m0, [r0]
> + movh m3, [r0 + r1]
> + movhps m0, [r0 + r1 * 2]
> + movhps m3, [r0 + r3]
> + DEINTB 1, 0, 4, 3, 7
> + lea r0, [r0 + r1 * 4]
> + VAR_CORE
> + movh m0, [r0]
> + movh m3, [r0 + r1]
> + movhps m0, [r0 + r1 * 2]
> + movhps m3, [r0 + r3]
> + DEINTB 1, 0, 4, 3, 7
> + lea r0, [r0 + r1 * 4]
> + VAR_CORE
> + movh m0, [r0]
> + movh m3, [r0 + r1]
> + movhps m0, [r0 + r1 * 2]
> + movhps m3, [r0 + r3]
> + DEINTB 1, 0, 4, 3, 7
> + lea r0, [r0 + r1 * 4]
> + VAR_CORE
> + movh m0, [r0]
> + movh m3, [r0 + r1]
> + movhps m0, [r0 + r1 * 2]
> + movhps m3, [r0 + r3]
> + DEINTB 1, 0, 4, 3, 7
> + lea r0, [r0 + r1 * 4]
> + VAR_CORE
> + dec r2d
> + jnz .loop
> + VAR_END 8, 32
> +
> cglobal pixel_var_16x16, 2,3,8
> VAR_START 1
> mov r2d, 8
> @@ -1313,38 +1413,6 @@
> dec r2d
> jg .loop
> VAR_END 16, 16
> -
> -cglobal pixel_var_8x8, 2,4,8
> - VAR_START 1
> - mov r2d, 2
> - lea r3, [r1*3]
> -.loop:
> - movh m0, [r0]
> - movh m3, [r0+r1]
> - movhps m0, [r0+r1*2]
> - movhps m3, [r0+r3]
> - DEINTB 1, 0, 4, 3, 7
> - lea r0, [r0+r1*4]
> - VAR_CORE
> - dec r2d
> - jg .loop
> - VAR_END 8, 8
> -
> -cglobal pixel_var_8x16, 2,4,8
> - VAR_START 1
> - mov r2d, 4
> - lea r3, [r1*3]
> -.loop:
> - movh m0, [r0]
> - movh m3, [r0+r1]
> - movhps m0, [r0+r1*2]
> - movhps m3, [r0+r3]
> - DEINTB 1, 0, 4, 3, 7
> - lea r0, [r0+r1*4]
> - VAR_CORE
> - dec r2d
> - jg .loop
> - VAR_END 8, 16
> %endmacro ; VAR
>
> INIT_XMM sse2
> diff -r 43da6ca15a61 -r deb2fc2dcaf2 source/common/x86/pixel.h
> --- a/source/common/x86/pixel.h Mon Nov 25 18:46:28 2013 +0530
> +++ b/source/common/x86/pixel.h Mon Nov 25 19:07:38 2013 +0530
> @@ -347,6 +347,17 @@
> CHROMA_PIXELSUB_DEF(_sse4);
> LUMA_PIXELSUB_DEF(_sse4);
>
> +#define SETUP_LUMA_PIXELVAR_FUNC(W, H, cpu) \
> + uint64_t x265_pixel_var_ ## W ## x ## H ## cpu(pixel *pix, intptr_t pixstride);
> +
> +#define LUMA_PIXELVAR_DEF(cpu) \
> + SETUP_LUMA_PIXELVAR_FUNC(8, 4, cpu); \
> + SETUP_LUMA_PIXELVAR_FUNC(8, 8, cpu); \
> + SETUP_LUMA_PIXELVAR_FUNC(8, 16, cpu); \
> + SETUP_LUMA_PIXELVAR_FUNC(8, 32, cpu);
> +
> +LUMA_PIXELVAR_DEF(_sse2);
> +
> #undef DECL_PIXELS
> #undef DECL_SUF
> #undef DECL_HEVC_SSD
> @@ -357,6 +368,8 @@
> #undef SETUP_LUMA_PIXELSUB_PS_FUNC
> #undef CHROMA_PIXELSUB_DEF
> #undef LUMA_PIXELSUB_DEF
> +#undef LUMA_PIXELVAR_DEF
> +#undef SETUP_LUMA_PIXELVAR_FUNC
>
> void x265_calcRecons4_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
> void x265_calcRecons8_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 842 bytes
Desc: Message signed with OpenPGP using GPGMail
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131125/a93d3c2c/attachment.sig>
More information about the x265-devel mailing list