[x265] Fwd: [PATCH] asm: pixelavg_pp[8xN] avx2 code for 10bpp
Praveen Tiwari
praveen at multicorewareinc.com
Mon Jun 29 08:08:18 CEST 2015
You may want to revisit the 8bpp code as well.
Regards,
Praveen
On Mon, Jun 29, 2015 at 11:24 AM, Rajesh Paulraj <
rajesh at multicorewareinc.com> wrote:
> We don't need to push this patch. I will improve the SSE version for the same
> size. We may not need AVX2 code for this (I will confirm after rewriting
> the SSE2 code).
>
> On Mon, Jun 29, 2015 at 10:21 AM, Deepthi Nandakumar <
> deepthi at multicorewareinc.com> wrote:
>
>> This does not build when HBD is disabled.
>>
>> On Fri, Jun 26, 2015 at 5:40 PM, Rajesh Paulraj <
>> rajesh at multicorewareinc.com> wrote:
>>
>>> Yes. It looks like we need to optimize the SSE2 code. I will work on this.
>>>
>>> On Fri, Jun 26, 2015 at 5:31 PM, Praveen Tiwari <
>>> praveen at multicorewareinc.com> wrote:
>>>
>>>>
>>>>
>>>>
>>>> ---------- Forwarded message ----------
>>>> From: <rajesh at multicorewareinc.com>
>>>> Date: Fri, Jun 26, 2015 at 3:14 PM
>>>> Subject: [x265] [PATCH] asm: pixelavg_pp[8xN] avx2 code for 10bpp
>>>> To: x265-devel at videolan.org
>>>>
>>>>
>>>> # HG changeset patch
>>>> # User Rajesh Paulraj<rajesh at multicorewareinc.com>
>>>> # Date 1435311076 -19800
>>>> # Fri Jun 26 15:01:16 2015 +0530
>>>> # Node ID 956401f1a679f1e71181b704d64e4acdb6f1a93f
>>>> # Parent d64227e54233d1646c55bcb4b0b831e5340009ed
>>>> asm: pixelavg_pp[8xN] avx2 code for 10bpp
>>>>
>>>> avx2:
>>>> avg_pp[ 8x4] 4.39x 145.09 636.75
>>>> avg_pp[ 8x8] 5.33x 215.27 1146.55
>>>> avg_pp[ 8x16] 6.50x 336.88 2190.68
>>>> avg_pp[ 8x32] 7.71x 579.86 4470.84
>>>>
>>>> sse2:
>>>> avg_pp[ 8x4] 2.31x 287.63 663.94
>>>> avg_pp[ 8x8] 3.26x 370.21 1205.26
>>>> avg_pp[ 8x16] 3.99x 581.63 2323.25
>>>> avg_pp[ 8x32] 4.78x 995.79 4755.58
>>>>
>>>>
>>>> Basically, our macro "pixel_avg_8xN" is just SSE code (a simple syntax
>>>> conversion for AVX2 that does not use the 256-bit capability), so fundamentally
>>>> there should be no major improvement in speed. But the improvements (287.63c
>>>> -> 145.09c, 370.21c -> 215.27c, etc.) are quite good. Does this mean the SSE2
>>>> code is not optimized well? Can you revisit the SSE code using this algorithm?
>>>>
>>>>
>>>>
>>>> diff -r d64227e54233 -r 956401f1a679
>>>> source/common/x86/asm-primitives.cpp
>>>> --- a/source/common/x86/asm-primitives.cpp Thu Jun 25 16:25:51
>>>> 2015 +0530
>>>> +++ b/source/common/x86/asm-primitives.cpp Fri Jun 26 15:01:16
>>>> 2015 +0530
>>>> @@ -1362,6 +1362,10 @@
>>>> p.cu[BLOCK_32x32].intra_pred[33] =
>>>> PFX(intra_pred_ang32_33_avx2);
>>>> p.cu[BLOCK_32x32].intra_pred[34] =
>>>> PFX(intra_pred_ang32_2_avx2);
>>>>
>>>> + p.pu[LUMA_8x4].pixelavg_pp = PFX(pixel_avg_8x4_avx2);
>>>> + p.pu[LUMA_8x8].pixelavg_pp = PFX(pixel_avg_8x8_avx2);
>>>> + p.pu[LUMA_8x16].pixelavg_pp = PFX(pixel_avg_8x16_avx2);
>>>> + p.pu[LUMA_8x32].pixelavg_pp = PFX(pixel_avg_8x32_avx2);
>>>> p.pu[LUMA_12x16].pixelavg_pp = PFX(pixel_avg_12x16_avx2);
>>>> p.pu[LUMA_16x4].pixelavg_pp = PFX(pixel_avg_16x4_avx2);
>>>> p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_16x8_avx2);
>>>> diff -r d64227e54233 -r 956401f1a679 source/common/x86/mc-a.asm
>>>> --- a/source/common/x86/mc-a.asm Thu Jun 25 16:25:51 2015 +0530
>>>> +++ b/source/common/x86/mc-a.asm Fri Jun 26 15:01:16 2015 +0530
>>>> @@ -4490,6 +4490,88 @@
>>>> RET
>>>> %endif
>>>>
>>>> +%macro pixel_avg_W8 0
>>>> + movu xm0, [r2]
>>>> + movu xm1, [r4]
>>>> + pavgw xm0, xm1
>>>> + movu [r0], xm0
>>>> + movu xm2, [r2 + r3]
>>>> + movu xm3, [r4 + r5]
>>>> + pavgw xm2, xm3
>>>> + movu [r0 + r1], xm2
>>>> +
>>>> + movu xm0, [r2 + r3 * 2]
>>>> + movu xm1, [r4 + r5 * 2]
>>>> + pavgw xm0, xm1
>>>> + movu [r0 + r1 * 2], xm0
>>>> + movu xm2, [r2 + r6]
>>>> + movu xm3, [r4 + r7]
>>>> + pavgw xm2, xm3
>>>> + movu [r0 + r8], xm2
>>>> +
>>>> + lea r0, [r0 + 4 * r1]
>>>> + lea r2, [r2 + 4 * r3]
>>>> + lea r4, [r4 + 4 * r5]
>>>> +%endmacro
>>>> +
>>>>
>>>> +;-------------------------------------------------------------------------------------------------------------------------------
>>>> +;void pixelavg_pp(pixel dst, intptr_t dstride, const pixel src0,
>>>> intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
>>>>
>>>> +;-------------------------------------------------------------------------------------------------------------------------------
>>>> +%if ARCH_X86_64
>>>> +INIT_YMM avx2
>>>> +cglobal pixel_avg_8x4, 6,10,4
>>>> + add r1d, r1d
>>>> + add r3d, r3d
>>>> + add r5d, r5d
>>>> + lea r6, [r3 * 3]
>>>> + lea r7, [r5 * 3]
>>>> + lea r8, [r1 * 3]
>>>> + pixel_avg_W8
>>>> + RET
>>>> +
>>>> +cglobal pixel_avg_8x8, 6,10,4
>>>> + add r1d, r1d
>>>> + add r3d, r3d
>>>> + add r5d, r5d
>>>> + lea r6, [r3 * 3]
>>>> + lea r7, [r5 * 3]
>>>> + lea r8, [r1 * 3]
>>>> + mov r9d, 2
>>>> +.loop
>>>> + pixel_avg_W8
>>>> + dec r9d
>>>> + jnz .loop
>>>> + RET
>>>> +
>>>> +cglobal pixel_avg_8x16, 6,10,4
>>>> + add r1d, r1d
>>>> + add r3d, r3d
>>>> + add r5d, r5d
>>>> + lea r6, [r3 * 3]
>>>> + lea r7, [r5 * 3]
>>>> + lea r8, [r1 * 3]
>>>> + mov r9d, 4
>>>> +.loop
>>>> + pixel_avg_W8
>>>> + dec r9d
>>>> + jnz .loop
>>>> + RET
>>>> +
>>>> +cglobal pixel_avg_8x32, 6,10,4
>>>> + add r1d, r1d
>>>> + add r3d, r3d
>>>> + add r5d, r5d
>>>> + lea r6, [r3 * 3]
>>>> + lea r7, [r5 * 3]
>>>> + lea r8, [r1 * 3]
>>>> + mov r9d, 8
>>>> +.loop
>>>> + pixel_avg_W8
>>>> + dec r9d
>>>> + jnz .loop
>>>> + RET
>>>> +%endif
>>>> +
>>>> %macro pixel_avg_H4 0
>>>> movu m0, [r2]
>>>> movu m1, [r4]
>>>> _______________________________________________
>>>> x265-devel mailing list
>>>> x265-devel at videolan.org
>>>> https://mailman.videolan.org/listinfo/x265-devel
>>>>
>>>>
>>>> _______________________________________________
>>>> x265-devel mailing list
>>>> x265-devel at videolan.org
>>>> https://mailman.videolan.org/listinfo/x265-devel
>>>>
>>>>
>>>
>>> _______________________________________________
>>> x265-devel mailing list
>>> x265-devel at videolan.org
>>> https://mailman.videolan.org/listinfo/x265-devel
>>>
>>>
>>
>> _______________________________________________
>> x265-devel mailing list
>> x265-devel at videolan.org
>> https://mailman.videolan.org/listinfo/x265-devel
>>
>>
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150629/2bb5dcfc/attachment-0001.html>
More information about the x265-devel
mailing list