[x265] [PATCH Review only] asm: 10bpp code for transpose 4x4, 8x8 and 16x16 blocks
chen
chenm003 at 163.com
Tue Dec 3 16:15:36 CET 2013
I verified your 4x4 and 8x8 code; it is correct, and I have pushed it. Could you make a new 16x16 patch?
At 2013-12-03 23:02:09,murugan at multicorewareinc.com wrote:
># HG changeset patch
># User Murugan Vairavel <murugan at multicorewareinc.com>
># Date 1386082906 -19800
># Tue Dec 03 20:31:46 2013 +0530
># Node ID 99134096118bff621f56949e3922cd3f53afdf10
># Parent 126f3aefc79dad37e7985953c404ccff370d2729
>asm: 10bpp code for transpose 4x4, 8x8 and 16x16 blocks
>
>diff -r 126f3aefc79d -r 99134096118b source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp Tue Dec 03 18:33:13 2013 +0530
>+++ b/source/common/x86/asm-primitives.cpp Tue Dec 03 20:31:46 2013 +0530
>@@ -520,6 +520,10 @@
> p.sse_ss[LUMA_32x32] = x265_pixel_ssd_ss_32x32_sse2;
> p.sse_ss[LUMA_32x64] = x265_pixel_ssd_ss_32x64_sse2;
>
>+ p.transpose[BLOCK_4x4] = x265_transpose4_sse2;
>+ p.transpose[BLOCK_8x8] = x265_transpose8_sse2;
>+ p.transpose[BLOCK_16x16] = x265_transpose16_sse2;
>+
> p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2;
> PIXEL_AVG(sse2);
> PIXEL_AVG_W4(mmx2);
>diff -r 126f3aefc79d -r 99134096118b source/common/x86/pixel-util8.asm
>--- a/source/common/x86/pixel-util8.asm Tue Dec 03 18:33:13 2013 +0530
>+++ b/source/common/x86/pixel-util8.asm Tue Dec 03 20:31:46 2013 +0530
>@@ -830,7 +830,20 @@
> ;-----------------------------------------------------------------
> INIT_XMM sse2
> cglobal transpose4, 3, 3, 4, dest, src, stride
>-
>+%if HIGH_BIT_DEPTH
>+ add r2, r2
>+ movh m0, [r1]
>+ movh m1, [r1 + r2]
>+ movh m2, [r1 + 2 * r2]
>+ lea r1, [r1 + 2 * r2]
>+ movh m3, [r1 + r2]
>+ punpcklwd m0, m1
>+ punpcklwd m2, m3
>+ punpckhdq m1, m0, m2
>+ punpckldq m0, m2
>+ movu [r0], m0
>+ movu [r0 + 16], m1
>+%else
> movd m0, [r1]
> movd m1, [r1 + r2]
> movd m2, [r1 + 2 * r2]
>@@ -841,15 +854,88 @@
> punpcklbw m2, m3
> punpcklwd m0, m2
> movu [r0], m0
>-
>+%endif
> RET
>
> ;-----------------------------------------------------------------
> ; void transpose_8x8(pixel *dst, pixel *src, intptr_t stride)
> ;-----------------------------------------------------------------
> INIT_XMM sse2
>-cglobal transpose8, 3, 3, 8, dest, src, stride
>-
>+%if HIGH_BIT_DEPTH
>+%macro TRANSPOSE_4x4 1
>+ movh m0, [r1]
>+ movh m1, [r1 + r2]
>+ movh m2, [r1 + 2 * r2]
>+ lea r1, [r1 + 2 * r2]
>+ movh m3, [r1 + r2]
>+ punpcklwd m0, m1
>+ punpcklwd m2, m3
>+ punpckhdq m1, m0, m2
>+ punpckldq m0, m2
>+ movlps [r0], m0
>+ movhps [r0 + %1], m0
>+ movlps [r0 + 2 * %1], m1
>+ lea r0, [r0 + 2 * %1]
>+ movhps [r0 + %1], m1
>+%endmacro
>+cglobal transpose8_internal
>+ TRANSPOSE_4x4 r5
>+ lea r1, [r1 + 2 * r2]
>+ lea r0, [r3 + 8]
>+ TRANSPOSE_4x4 r5
>+ lea r1, [r1 + 2 * r2]
>+ neg r2
>+ lea r1, [r1 + r2 * 8 + 8]
>+ neg r2
>+ lea r0, [r3 + 4 * r5]
>+ TRANSPOSE_4x4 r5
>+ lea r1, [r1 + 2 * r2]
>+ lea r0, [r3 + 8 + 4 * r5]
>+ TRANSPOSE_4x4 r5
>+ ret
>+cglobal transpose8, 3, 6, 4, dest, src, stride
>+ add r2, r2
>+ mov r3, r0
>+ mov r4, r1
>+ mov r5, 16
>+ call transpose8_internal
>+ ret
>+%else
>+cglobal transpose8, 3, 5, 8, dest, src, stride
>+ lea r3, [2 * r2]
>+ lea r4, [3 * r2]
>+ movh m0, [r1]
>+ movh m1, [r1 + r2]
>+ movh m2, [r1 + r3]
>+ movh m3, [r1 + r4]
>+ movh m4, [r1 + 4 * r2]
>+ lea r1, [r1 + 4 * r2]
>+ movh m5, [r1 + r2]
>+ movh m6, [r1 + r3]
>+ movh m7, [r1 + r4]
>+
>+ punpcklbw m0, m1
>+ punpcklbw m2, m3
>+ punpcklbw m4, m5
>+ punpcklbw m6, m7
>+
>+ punpckhwd m1, m0, m2
>+ punpcklwd m0, m2
>+ punpckhwd m5, m4, m6
>+ punpcklwd m4, m6
>+ punpckhdq m2, m0, m4
>+ punpckldq m0, m4
>+ punpckhdq m3, m1, m5
>+ punpckldq m1, m5
>+
>+ movu [r0], m0
>+ movu [r0 + 16], m2
>+ movu [r0 + 32], m1
>+ movu [r0 + 48], m3
>+%endif
>+ RET
>+
>+%macro TRANSPOSE_8x8 1
> movh m0, [r1]
> movh m1, [r1 + r2]
> movh m2, [r1 + 2 * r2]
>@@ -866,42 +952,6 @@
> punpcklbw m2, m3
> punpcklbw m4, m5
> punpcklbw m6, m7
>-
>- punpckhwd m1, m0, m2
>- punpcklwd m0, m2
>- punpckhwd m5, m4, m6
>- punpcklwd m4, m6
>- punpckhdq m2, m0, m4
>- punpckldq m0, m4
>- punpckhdq m3, m1, m5
>- punpckldq m1, m5
>-
>- movu [r0], m0
>- movu [r0 + 16], m2
>- movu [r0 + 32], m1
>- movu [r0 + 48], m3
>-
>- RET
>-
>-%macro TRANSPOSE_8x8 1
>-
>- movh m0, [r1]
>- movh m1, [r1 + r2]
>- movh m2, [r1 + 2 * r2]
>- lea r1, [r1 + 2 * r2]
>- movh m3, [r1 + r2]
>- movh m4, [r1 + 2 * r2]
>- lea r1, [r1 + 2 * r2]
>- movh m5, [r1 + r2]
>- movh m6, [r1 + 2 * r2]
>- lea r1, [r1 + 2 * r2]
>- movh m7, [r1 + r2]
>-
>- punpcklbw m0, m1
>- punpcklbw m2, m3
>- punpcklbw m4, m5
>- punpcklbw m6, m7
>-
> punpckhwd m1, m0, m2
> punpcklwd m0, m2
> punpckhwd m5, m4, m6
>@@ -922,14 +972,33 @@
> movlps [r0 + 2 * %1], m3
> lea r0, [r0 + 2 * %1]
> movhps [r0 + %1], m3
>-
> %endmacro
>
>-
> ;-----------------------------------------------------------------
> ; void transpose_16x16(pixel *dst, pixel *src, intptr_t stride)
> ;-----------------------------------------------------------------
> INIT_XMM sse2
>+%if HIGH_BIT_DEPTH
>+cglobal transpose16, 3, 7, 4, dest, src, stride
>+ add r2, r2
>+ mov r3, r0
>+ mov r4, r1
>+ mov r5, 32
>+ mov r6, r0
>+ call transpose8_internal
>+ lea r1, [r1 - 8 + 2 * r2]
>+ lea r0, [r6 + 16]
>+ mov r3, r0
>+ call transpose8_internal
>+ lea r1, [r4 + 16]
>+ lea r0, [r6 + 8 * 32]
>+ mov r3, r0
>+ call transpose8_internal
>+ lea r1, [r1 - 8 + 2 * r2]
>+ lea r0, [r6 + 8 * 32 + 16]
>+ mov r3, r0
>+ call transpose8_internal
>+%else
> cglobal transpose16, 3, 5, 8, dest, src, stride
>
> mov r3, r0
>@@ -944,7 +1013,7 @@
> lea r1, [r1 + 2 * r2]
> lea r0, [r3 + 8 * 16 + 8]
> TRANSPOSE_8x8 16
>-
>+%endif
> RET
>
> cglobal transpose16_internal
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131203/5661f481/attachment-0001.html>
More information about the x265-devel mailing list