[x265] [PATCH Review only] asm: 10bpp code for transpose 4x4 and 8x8
Murugan Vairavel
murugan at multicorewareinc.com
Tue Dec 3 15:49:49 CET 2013
Please ignore this patch; it needs modifications for 16x16.
On Tue, Dec 3, 2013 at 7:08 PM, <murugan at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Murugan Vairavel <murugan at multicorewareinc.com>
> # Date 1386077908 -19800
> # Tue Dec 03 19:08:28 2013 +0530
> # Node ID 1ae4e8ae04d0792db6590a62272990d83f49a265
> # Parent 126f3aefc79dad37e7985953c404ccff370d2729
> asm: 10bpp code for transpose 4x4 and 8x8
>
> diff -r 126f3aefc79d -r 1ae4e8ae04d0 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Tue Dec 03 18:33:13 2013 +0530
> +++ b/source/common/x86/asm-primitives.cpp Tue Dec 03 19:08:28 2013 +0530
> @@ -520,6 +520,9 @@
> p.sse_ss[LUMA_32x32] = x265_pixel_ssd_ss_32x32_sse2;
> p.sse_ss[LUMA_32x64] = x265_pixel_ssd_ss_32x64_sse2;
>
> + p.transpose[BLOCK_4x4] = x265_transpose4_sse2;
> + p.transpose[BLOCK_8x8] = x265_transpose8_sse2;
> +
> p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2;
> PIXEL_AVG(sse2);
> PIXEL_AVG_W4(mmx2);
> diff -r 126f3aefc79d -r 1ae4e8ae04d0 source/common/x86/pixel-util8.asm
> --- a/source/common/x86/pixel-util8.asm Tue Dec 03 18:33:13 2013 +0530
> +++ b/source/common/x86/pixel-util8.asm Tue Dec 03 19:08:28 2013 +0530
> @@ -830,7 +830,20 @@
> ;-----------------------------------------------------------------
> INIT_XMM sse2
> cglobal transpose4, 3, 3, 4, dest, src, stride
> -
> +%if HIGH_BIT_DEPTH
> + add r2, r2
> + movh m0, [r1]
> + movh m1, [r1 + r2]
> + movh m2, [r1 + 2 * r2]
> + lea r1, [r1 + 2 * r2]
> + movh m3, [r1 + r2]
> + punpcklwd m0, m1
> + punpcklwd m2, m3
> + punpckhdq m1, m0, m2
> + punpckldq m0, m2
> + movu [r0], m0
> + movu [r0 + 16], m1
> +%else
> movd m0, [r1]
> movd m1, [r1 + r2]
> movd m2, [r1 + 2 * r2]
> @@ -841,26 +854,61 @@
> punpcklbw m2, m3
> punpcklwd m0, m2
> movu [r0], m0
> -
> +%endif
> RET
>
> ;-----------------------------------------------------------------
> ; void transpose_8x8(pixel *dst, pixel *src, intptr_t stride)
> ;-----------------------------------------------------------------
> INIT_XMM sse2
> -cglobal transpose8, 3, 3, 8, dest, src, stride
> -
> +%if HIGH_BIT_DEPTH
> +%macro TRANSPOSE_4x4 1
> movh m0, [r1]
> movh m1, [r1 + r2]
> movh m2, [r1 + 2 * r2]
> lea r1, [r1 + 2 * r2]
> movh m3, [r1 + r2]
> - movh m4, [r1 + 2 * r2]
> - lea r1, [r1 + 2 * r2]
> + punpcklwd m0, m1
> + punpcklwd m2, m3
> + punpckhdq m1, m0, m2
> + punpckldq m0, m2
> + movlps [r0], m0
> + movhps [r0 + %1], m0
> + movlps [r0 + 2 * %1], m1
> + lea r0, [r0 + 2 * %1]
> + movhps [r0 + %1], m1
> +%endmacro
> +cglobal transpose8_internal
> + TRANSPOSE_4x4 r5
> + lea r1, [r1 + 2 * r2]
> + lea r0, [r3 + 8]
> + TRANSPOSE_4x4 r5
> + lea r1, [r4 + 8]
> + lea r0, [r3 + 4 * r5]
> + TRANSPOSE_4x4 r5
> + lea r1, [r1 + 2 * r2]
> + lea r0, [r3 + 8 + 4 * r5]
> + TRANSPOSE_4x4 r5
> + ret
> +cglobal transpose8, 3, 6, 4, dest, src, stride
> + add r2, r2
> + mov r3, r0
> + mov r4, r1
> + mov r5, 16
> + call transpose8_internal
> +%else
> +cglobal transpose8, 3, 5, 8, dest, src, stride
> + lea r3, [2 * r2]
> + lea r4, [3 * r2]
> + movh m0, [r1]
> + movh m1, [r1 + r2]
> + movh m2, [r1 + r3]
> + movh m3, [r1 + r4]
> + movh m4, [r1 + 4 * r2]
> + lea r1, [r1 + 4 * r2]
> movh m5, [r1 + r2]
> - movh m6, [r1 + 2 * r2]
> - lea r1, [r1 + 2 * r2]
> - movh m7, [r1 + r2]
> + movh m6, [r1 + r3]
> + movh m7, [r1 + r4]
>
> punpcklbw m0, m1
> punpcklbw m2, m3
> @@ -880,7 +928,7 @@
> movu [r0 + 16], m2
> movu [r0 + 32], m1
> movu [r0 + 48], m3
> -
> +%endif
> RET
>
> %macro TRANSPOSE_8x8 1
>
--
With Regards,
Murugan. V
+919659287478
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131203/9cade489/attachment.html>
More information about the x265-devel mailing list