[x265] [PATCH Review only] asm: code for transpose_8x8 routine
Steve Borho
steve at borho.org
Tue Nov 19 23:05:05 CET 2013
Sorry, I confused this one with a different patch series. I hadn't taken the 4x4 patch because Min asked for some white-space improvements.
Please send his requested changes as a follow-on patch.
On Nov 19, 2013, at 3:59 PM, Steve Borho <steve at borho.org> wrote:
> I can't take this or the 16x16 patch because the 4x4 patch still causes crashes. You'll need to fix the first one and then resubmit these all together.
>
> On Nov 19, 2013, at 12:23 AM, murugan at multicorewareinc.com wrote:
>
>> # HG changeset patch
>> # User Murugan Vairavel <murugan at multicorewareinc.com>
>> # Date 1384842189 -19800
>> # Tue Nov 19 11:53:09 2013 +0530
>> # Node ID 3a94cc365533bf7def255dc5b28e6a6a1d1bfa50
>> # Parent f6a050b79cfa400aa432f49ee8a4c2b9f20cf930
>> asm: code for transpose_8x8 routine
>>
>> diff -r f6a050b79cfa -r 3a94cc365533 source/common/x86/asm-primitives.cpp
>> --- a/source/common/x86/asm-primitives.cpp Tue Nov 19 11:25:00 2013 +0530
>> +++ b/source/common/x86/asm-primitives.cpp Tue Nov 19 11:53:09 2013 +0530
>> @@ -546,6 +546,7 @@
>> p.calcresidual[BLOCK_4x4] = x265_getResidual4_sse2;
>> p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2;
>> p.transpose[BLOCK_4x4] = x265_transpose4_sse2;
>> + p.transpose[BLOCK_8x8] = x265_transpose8_sse2;
>> }
>> if (cpuMask & X265_CPU_SSSE3)
>> {
>> diff -r f6a050b79cfa -r 3a94cc365533 source/common/x86/pixel-a.asm
>> --- a/source/common/x86/pixel-a.asm Tue Nov 19 11:25:00 2013 +0530
>> +++ b/source/common/x86/pixel-a.asm Tue Nov 19 11:53:09 2013 +0530
>> @@ -8359,3 +8359,45 @@
>> movu [r0], m0
>>
>> RET
>> +
>> +;-----------------------------------------------------------------
>> +; void transpose_8x8(pixel *dst, pixel *src, intptr_t stride)
>> +;-----------------------------------------------------------------
>> +INIT_XMM sse2
>> +cglobal transpose8, 3, 3, 8, dest, src, stride
>> +
>> + movh m0, [r1]
>> + movh m1, [r1 + r2]
>> + movh m2, [r1 + 2 * r2]
>> + lea r1, [r1 + 2 * r2]
>> + movh m3, [r1 + r2]
>> + movh m4, [r1 + 2 * r2]
>> + lea r1, [r1 + 2 * r2]
>> + movh m5, [r1 + r2]
>> + movh m6, [r1 + 2 * r2]
>> + lea r1, [r1 + 2 * r2]
>> + movh m7, [r1 + r2]
>> +
>> + punpcklbw m0, m1
>> + punpcklbw m2, m3
>> + punpcklbw m4, m5
>> + punpcklbw m6, m7
>> + movu m1, m0
>> + punpcklwd m0, m2
>> + punpckhwd m1, m2
>> + movu m5, m4
>> + punpcklwd m4, m6
>> + punpckhwd m5, m6
>> + movu m2, m0
>> + punpckldq m0, m4
>> + punpckhdq m2, m4
>> + movu m3, m1
>> + punpckldq m1, m5
>> + punpckhdq m3, m5
>> +
>> + movu [r0], m0
>> + movu [r0 + 16], m2
>> + movu [r0 + 32], m1
>> + movu [r0 + 48], m3
>> +
>> + RET
>> diff -r f6a050b79cfa -r 3a94cc365533 source/common/x86/pixel.h
>> --- a/source/common/x86/pixel.h Tue Nov 19 11:25:00 2013 +0530
>> +++ b/source/common/x86/pixel.h Tue Nov 19 11:53:09 2013 +0530
>> @@ -366,5 +366,6 @@
>> void x265_getResidual16_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
>> void x265_getResidual32_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
>> void x265_transpose4_sse2(pixel *dest, pixel *src, intptr_t stride);
>> +void x265_transpose8_sse2(pixel *dest, pixel *src, intptr_t stride);
>>
>> #endif // ifndef X265_I386_PIXEL_H
>> _______________________________________________
>> x265-devel mailing list
>> x265-devel at videolan.org
>> https://mailman.videolan.org/listinfo/x265-devel
>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 842 bytes
Desc: Message signed with OpenPGP using GPGMail
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131119/0003812c/attachment.sig>
More information about the x265-devel
mailing list