[x265] [PATCH] ASM routine for luma filter for 13xN block size

Fri Oct 11 07:06:55 CEST 2013

On Thu, Oct 10, 2013 at 10:52 PM, Steve Borho <steve at borho.org> wrote:

>
>
>
> On Thu, Oct 10, 2013 at 6:08 AM, <nabajit at multicorewareinc.com> wrote:
>
>> # HG changeset patch
>> # User Nabajit Deka
>> # Date 1381403296 -19800
>> #      Thu Oct 10 16:38:16 2013 +0530
>> # Node ID 638f17a6d2de628566ac51d1461d8f94ee33ceeb
>> # Parent  a79ecf3a787577a2e557659c7a8d226d7d41ce00
>> ASM routine for luma filter for 13xN block size.
>>
>
> I thought we had decided to only write assembly for the "real" luma inter
> block sizes and to use the existing intrinsic primitive that takes
> width/height arguments for these odd sizes used by subpel refine; until we
> can figure out if that ME optimization is still useful.
>

--> Oh, I misunderstood your point. But extending the luma filter beyond
the real luma inter block sizes wasn't much of an effort, and coding has
been finished for all the sizes. The current functions take height as an
argument, and the width is hard-coded to fit the block sizes.

>
>
>>
>> diff -r a79ecf3a7875 -r 638f17a6d2de source/common/x86/ipfilter8.asm
>> --- a/source/common/x86/ipfilter8.asm   Thu Oct 10 12:29:41 2013 +0530
>> +++ b/source/common/x86/ipfilter8.asm   Thu Oct 10 16:38:16 2013 +0530
>> @@ -130,3 +130,71 @@
>>      RET
>>
>>  %endif  ; ARCH_X86_64 == 0
>> +
>> +SECTION_RODATA 32
>> +
>> +tab_Tm8:    db 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8
>> +            db 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10
>> +            db 4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12
>> +            db 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14
>> +
>> +tab_c_512:  times 8 dw 512
>> +
>> +SECTION .text
>> +
>> +%macro FILTER_H8 3
>> +    movu        %1, [srcq - 3]
>> +    pshufb      %2, %1, [tab_Tm8]
>> +    pmaddubsw   %2, m3
>> +    pshufb      m7, %1, [tab_Tm8 + 16]
>> +    pmaddubsw   m7, m3
>> +    phaddw      %2, m7
>> +    pshufb      m7, %1, [tab_Tm8 + 32]
>> +    pmaddubsw   m7, m3
>> +    pshufb      %1, %1, [tab_Tm8 + 48]
>> +    pmaddubsw   %1, m3
>> +    phaddw      m7, %1
>> +    phaddw      %2, m7
>> +    pmulhrsw    %2, %3
>> +    packuswb    %2, %2
>> +    movh        [r2], %2
>> +    movu        %1, [srcq - 3 + 8]
>> +    pshufb      %2, %1, [tab_Tm8]
>> +    pmaddubsw   %2, m3
>> +    pshufb      m7, %1, [tab_Tm8 + 16]
>> +    pmaddubsw   m7, m3
>> +    phaddw      %2, m7
>> +    pshufb      m7, %1, [tab_Tm8 + 32]
>> +    pmaddubsw   m7, m3
>> +    phaddw      m7, m7
>> +    phaddw      %2, m7
>> +    pmulhrsw    %2, %3
>> +    packuswb    %2, %2
>> +    pextrd      [r2 + 8], %2, 0
>> +    pextrb      [r2 + 12], %2, 4
>> +%endmacro
>> +
>>
>> +;-----------------------------------------------------------------------------
>> +; void filterHorizontal_p_p_8(pixel *src, intptr_t srcStride, pixel
>> *dst, intptr_t dstStride, int width, int height, short const *coeff)
>>
>> +;-----------------------------------------------------------------------------
>> +INIT_XMM sse4
>> +cglobal filterHorizontal_p_p_8, 4, 5, 5, src, srcStride, dst, dstStride
>>
>
> the function name implies 8xN?
>

--> The "8" in the function name was meant to denote the 8-tap filter. We
can add the block size as a suffix to the function name.

>
>
>> +
>> +    mov       r4, r6m
>> +    movu      m3, [r4]
>> +    packsswb  m3, m3
>> +
>> +    mova      m2, [tab_c_512]
>> +
>> +    mov r4,   r5m
>> +
>> +.loop
>> +    FILTER_H8   m0, m1, m2
>> +
>> +    add         srcq, srcStrideq
>> +    add         dstq, dstStrideq
>> +
>> +    dec r4d
>> +    jnz .loop
>> +
>> +    RET
>> _______________________________________________
>> x265-devel mailing list
>> x265-devel at videolan.org
>> https://mailman.videolan.org/listinfo/x265-devel
>>
>
>
>
> --
> Steve Borho
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131011/cd9409a6/attachment.html>