[x265] [PATCH 1 of 3] asm: general calSign to accelerate sao

Steve Borho steve at borho.org
Fri Apr 3 17:48:37 CEST 2015


On 04/03, Min Chen wrote:
> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1428059407 -28800
> # Node ID abf20efa2234fb7cd6a474d4dac6e3051a94b30c
> # Parent  9a5fa67583feb6ffb7668f82632f7e93e5ec9415
> asm: general calSign to accelerate sao

Queued, with some adjustments so that it applies on top of Dnyaneshwar's cleanup of const-a.asm.

> ---
>  source/common/x86/const-a.asm    |    3 ++
>  source/common/x86/loopfilter.asm |   69 ++++++++++++++++++++++++++-----------
>  source/encoder/sao.cpp           |   14 ++------
>  source/test/pixelharness.cpp     |    8 ++--
>  4 files changed, 58 insertions(+), 36 deletions(-)
> 
> diff -r 9a5fa67583fe -r abf20efa2234 source/common/x86/const-a.asm
> --- a/source/common/x86/const-a.asm	Thu Apr 02 13:21:32 2015 -0500
> +++ b/source/common/x86/const-a.asm	Fri Apr 03 19:10:07 2015 +0800
> @@ -65,6 +65,9 @@
>  const pb_32,       times 32 db 32
>  const pb_128,      times 16 db 128
>  const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6
> +const pb_movemask, times 16 db 0x00
> +                   times 16 db 0xFF
> +                   
>  
>  const pw_0_15,     times 2 dw 0, 1, 2, 3, 4, 5, 6, 7
>  const pw_2,        times 8 dw 2
> diff -r 9a5fa67583fe -r abf20efa2234 source/common/x86/loopfilter.asm
> --- a/source/common/x86/loopfilter.asm	Thu Apr 02 13:21:32 2015 -0500
> +++ b/source/common/x86/loopfilter.asm	Fri Apr 03 19:10:07 2015 +0800
> @@ -36,6 +36,7 @@
>  cextern pb_128
>  cextern pb_2
>  cextern pw_2
> +cextern pb_movemask
>  
>  
>  ;============================================================================================================
> @@ -321,29 +322,55 @@
>      RET
>  
>  ;============================================================================================================
> -; void calSign(int8_t *dst, const Pixel *src1, const Pixel *src2, const int endX)
> +; void calSign(int8_t *dst, const Pixel *src1, const Pixel *src2, const int width)
>  ;============================================================================================================
>  INIT_XMM sse4
> -cglobal calSign, 4, 4, 6
> -    mova        m1,    [pb_128]
> -    mova        m0,    [pb_1]
> -    shr         r3d,   4
> -.loop
> -    movu        m2,    [r1]        ; m2 = pRec[x]
> -    movu        m3,    [r2]        ; m3 = pTmpU[x]
> +cglobal calSign, 4,5,6
> +    mova        m0,     [pb_128]
> +    mova        m1,     [pb_1]
>  
> -    pxor        m4,    m2,    m1
> -    pxor        m3,    m1
> -    pcmpgtb     m5,    m4,    m3
> -    pcmpgtb     m3,    m4
> -    pand        m5,    m0
> -    por         m5,    m3
> +    sub         r1,     r0
> +    sub         r2,     r0
>  
> -    movu        [r0],  m5
> +    mov         r4d,    r3d
> +    shr         r3d,    4
> +    jz         .next
> +.loop:
> +    movu        m2,     [r0 + r1]            ; m2 = pRec[x]
> +    movu        m3,     [r0 + r2]            ; m3 = pTmpU[x]
> +    pxor        m4,     m2,     m0
> +    pxor        m3,     m0
> +    pcmpgtb     m5,     m4,     m3
> +    pcmpgtb     m3,     m4
> +    pand        m5,     m1
> +    por         m5,     m3
> +    movu        [r0],   m5
>  
> -    add         r0,    16
> -    add         r1,    16
> -    add         r2,    16
> +    add         r0,     16
>      dec         r3d
>      jnz        .loop
> +
> +    ; process partial
> +.next:
> +    and         r4d, 15
> +    jz         .end
> +
> +    movu        m2,     [r0 + r1]            ; m2 = pRec[x]
> +    movu        m3,     [r0 + r2]            ; m3 = pTmpU[x]
> +    pxor        m4,     m2,     m0
> +    pxor        m3,     m0
> +    pcmpgtb     m5,     m4,     m3
> +    pcmpgtb     m3,     m4
> +    pand        m5,     m1
> +    por         m5,     m3
> +
> +    lea         r3,     [pb_movemask + 16]
> +    sub         r3,     r4
> +    movu        xmm0,   [r3]
> +    movu        m3,     [r0]
> +    pblendvb    m5,     m5,     m3,     xmm0
> +    movu        [r0],   m5
> +
> +.end:
>      RET
> +
> diff -r 9a5fa67583fe -r abf20efa2234 source/encoder/sao.cpp
> --- a/source/encoder/sao.cpp	Thu Apr 02 13:21:32 2015 -0500
> +++ b/source/encoder/sao.cpp	Fri Apr 03 19:10:07 2015 +0800
> @@ -783,13 +783,7 @@
>                  rec += stride;
>              }
>  
> -            if (!(ctuWidth & 15))
> -                primitives.sign(upBuff1, rec, &rec[- stride], ctuWidth);
> -            else
> -            {
> -                for (x = 0; x < ctuWidth; x++)
> -                    upBuff1[x] = signOf(rec[x] - rec[x - stride]);
> -            }
> +            primitives.sign(upBuff1, rec, &rec[- stride], ctuWidth);
>  
>              for (y = startY; y < endY; y++)
>              {
> @@ -832,8 +826,7 @@
>                  rec += stride;
>              }
>  
> -            for (x = startX; x < endX; x++)
> -                upBuff1[x] = signOf(rec[x] - rec[x - stride - 1]);
> +            primitives.sign(&upBuff1[startX], &rec[startX], &rec[startX - stride - 1], (endX - startX));
>  
>              for (y = startY; y < endY; y++)
>              {
> @@ -879,8 +872,7 @@
>                  rec += stride;
>              }
>  
> -            for (x = startX - 1; x < endX; x++)
> -                upBuff1[x] = signOf(rec[x] - rec[x - stride + 1]);
> +            primitives.sign(&upBuff1[startX - 1], &rec[startX - 1], &rec[startX - 1 - stride + 1], (endX - startX + 1));
>  
>              for (y = startY; y < endY; y++)
>              {
> diff -r 9a5fa67583fe -r abf20efa2234 source/test/pixelharness.cpp
> --- a/source/test/pixelharness.cpp	Thu Apr 02 13:21:32 2015 -0500
> +++ b/source/test/pixelharness.cpp	Fri Apr 03 19:10:07 2015 +0800
> @@ -870,8 +870,8 @@
>  
>  bool PixelHarness::check_calSign(sign_t ref, sign_t opt)
>  {
> -    ALIGN_VAR_16(int8_t, ref_dest[64 * 64]);
> -    ALIGN_VAR_16(int8_t, opt_dest[64 * 64]);
> +    ALIGN_VAR_16(int8_t, ref_dest[64 * 2]);
> +    ALIGN_VAR_16(int8_t, opt_dest[64 * 2]);
>  
>      memset(ref_dest, 0xCD, sizeof(ref_dest));
>      memset(opt_dest, 0xCD, sizeof(opt_dest));
> @@ -880,12 +880,12 @@
>  
>      for (int i = 0; i < ITERS; i++)
>      {
> -        int width = 16 * (rand() % 4 + 1);
> +        int width = (rand() % 64) + 1;
>  
>          ref(ref_dest, pbuf2 + j, pbuf3 + j, width);
>          checked(opt, opt_dest, pbuf2 + j, pbuf3 + j, width);
>  
> -        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int8_t)))
> +        if (memcmp(ref_dest, opt_dest, sizeof(ref_dest)))
>              return false;
>  
>          reportfail();
> 
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-- 
Steve Borho


More information about the x265-devel mailing list