[x265] [PATCH] asm: sse4 10bit code for sign primitive

Rajesh Paulraj rajesh at multicorewareinc.com
Thu Jun 25 12:33:13 CEST 2015


Sorry. Both are the same primitive. I will correct it and resend the two
patches.

On Thu, Jun 25, 2015 at 3:34 PM, Deepthi Nandakumar <
deepthi at multicorewareinc.com> wrote:

>
>
> On Thu, Jun 25, 2015 at 2:19 PM, <rajesh at multicorewareinc.com> wrote:
>
>> # HG changeset patch
>> # User Rajesh Paulraj<rajesh at multicorewareinc.com>
>> # Date 1435219198 -19800
>> #      Thu Jun 25 13:29:58 2015 +0530
>> # Node ID a03487d6295cf89b065eff36e5c1ec4ee4253243
>> # Parent  b1af4c36f48a4500a4912373ebcda9a5540b5c15
>> asm: sse4 10bit code for sign primitive
>>
>>      calSign  6.16x    356.91          2197.63
>>
>> diff -r b1af4c36f48a -r a03487d6295c source/common/x86/asm-primitives.cpp
>> --- a/source/common/x86/asm-primitives.cpp      Wed Jun 24 10:36:15 2015
>> -0500
>> +++ b/source/common/x86/asm-primitives.cpp      Thu Jun 25 13:29:58 2015
>> +0530
>> @@ -1097,6 +1097,7 @@
>>          p.saoCuOrgE3[0] = PFX(saoCuOrgE3_sse4);
>>          p.saoCuOrgE3[1] = PFX(saoCuOrgE3_sse4);
>>          p.saoCuOrgB0 = PFX(saoCuOrgB0_sse4);
>> +        p.sign = x265_calculateSign_sse4;
>>
>> This should be PFX().
>
>
>>          LUMA_ADDAVG(sse4);
>>          CHROMA_420_ADDAVG(sse4);
>> diff -r b1af4c36f48a -r a03487d6295c source/common/x86/loopfilter.asm
>> --- a/source/common/x86/loopfilter.asm  Wed Jun 24 10:36:15 2015 -0500
>> +++ b/source/common/x86/loopfilter.asm  Thu Jun 25 13:29:58 2015 +0530
>> @@ -40,6 +40,7 @@
>>  cextern pw_2
>>  cextern pw_1023
>>  cextern pb_movemask
>> +cextern pw_1
>>
>>
>>
>>  ;============================================================================================================
>> @@ -1419,3 +1420,49 @@
>>
>>  .end:
>>      RET
>> +
>>
>> +;-----------------------------------------------------------------------------
>> +; void calSign(int8_t *dst, const pixel *src1, const pixel *src2, const
>> int endX)
>>
>> +;-----------------------------------------------------------------------------
>> +%if HIGH_BIT_DEPTH
>> +INIT_XMM sse4
>> +cglobal calculateSign, 4, 7, 5
>> +    mova            m0, [pw_1]
>> +    mov             r4d, r3d
>> +    shr             r3d, 4
>> +    add             r3d, 1
>> +    mov             r5, r0
>> +    movu            m4, [r0 + r4]
>> +.loop
>> +    movu            m1, [r1]        ; m2 = pRec[x]
>> +    movu            m2, [r2]        ; m3 = pTmpU[x]
>> +
>> +    pcmpgtw         m3, m1, m2
>> +    pcmpgtw         m2, m1
>> +    pand            m3, m0
>> +    por             m3, m2
>> +    packsswb        m3, m3
>> +    movh            [r0], xm3
>> +
>> +    movu            m1, [r1 + 16]   ; m2 = pRec[x]
>> +    movu            m2, [r2 + 16]   ; m3 = pTmpU[x]
>> +
>> +    pcmpgtw         m3, m1, m2
>> +    pcmpgtw         m2, m1
>> +    pand            m3, m0
>> +    por             m3, m2
>> +    packsswb        m3, m3
>> +    movh            [r0 + 8], xm3
>> +
>> +    add             r0, 16
>> +    add             r1, 32
>> +    add             r2, 32
>> +    dec             r3d
>> +    jnz             .loop
>> +
>> +    mov             r6, r0
>> +    sub             r6, r5
>> +    sub             r4, r6
>> +    movu            [r0 + r4], m4
>> +    RET
>> +%endif
>> diff -r b1af4c36f48a -r a03487d6295c source/common/x86/loopfilter.h
>> --- a/source/common/x86/loopfilter.h    Wed Jun 24 10:36:15 2015 -0500
>> +++ b/source/common/x86/loopfilter.h    Thu Jun 25 13:29:58 2015 +0530
>> @@ -37,7 +37,8 @@
>>      void PFX(saoCuOrgB0_ ## cpu)(pixel* rec, const int8_t* offsetBo, int
>> ctuWidth, int ctuHeight, intptr_t stride); \
>>      void PFX(saoCuStatsE2_ ## cpu)(const pixel *fenc, const pixel *rec,
>> intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY,
>> int32_t *stats, int32_t *count); \
>>      void PFX(saoCuStatsE3_ ## cpu)(const pixel *fenc, const pixel *rec,
>> intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats,
>> int32_t *count); \
>> -    void PFX(calSign_ ## cpu)(int8_t *dst, const pixel *src1, const
>> pixel *src2, const int endX);
>> +    void PFX(calSign_ ## cpu)(int8_t *dst, const pixel *src1, const
>> pixel *src2, const int endX); \
>> +    void PFX(calculateSign_ ## cpu)(int8_t *dst, const pixel *src1,
>> const pixel *src2, const int endX);
>>
>> What's the difference between calculateSign_ and calSign_? They have the
> same function signature and are assigned to the same primitive?
>
>
>>  DECL_SAO(sse4);
>>  DECL_SAO(avx2);
>> _______________________________________________
>> x265-devel mailing list
>> x265-devel at videolan.org
>> https://mailman.videolan.org/listinfo/x265-devel
>>
>
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150625/0be8f139/attachment-0001.html>


More information about the x265-devel mailing list