[x265] [PATCH] aoCuOrgE2: asm code

Steve Borho steve at borho.org
Tue Jan 6 12:56:07 CET 2015


On 01/06, praveen at multicorewareinc.com wrote:
> # HG changeset patch
> # User Praveen Tiwari
> # Date 1420541256 -19800
> # Node ID 382dc33423b4d18ff7babbe8f97cbba58f77876b
> # Parent  feebd0ecda691aeaf9265c7cb20897169df6866a
> aoCuOrgE2: asm code

queued

> diff -r feebd0ecda69 -r 382dc33423b4 source/common/loopfilter.cpp
> --- a/source/common/loopfilter.cpp	Mon Jan 05 18:57:20 2015 +0530
> +++ b/source/common/loopfilter.cpp	Tue Jan 06 16:17:36 2015 +0530
> @@ -57,6 +57,19 @@
>      }
>  }
>  
> +void processSaoCUE2(pixel * rec, int8_t * bufft, int8_t * buff1, int8_t * offsetEo, int width, intptr_t stride)
> +{
> +    // For each of the first `width` pixels: compare with rec[col + stride + 1], pick an offset, add it and clamp.
> +    for (int col = 0; col < width; col++)
> +    {
> +        const int8_t below = signOf(rec[col] - rec[col + stride + 1]);
> +        const int category = below + buff1[col] + 2;    // buff1[col] is the caller-supplied sign term for this column
> +        bufft[col + 1] = -below;                         // negated sign is stored one slot to the right in bufft
> +        const short sum = rec[col] + offsetEo[category];
> +        rec[col] = (pixel)(sum < 0 ? 0 : (sum > (PIXEL_MAX)) ? (PIXEL_MAX) : sum);
> +    }
> +}
> +
>  void processSaoCUB0(pixel* rec, const int8_t* offset, int ctuWidth, int ctuHeight, intptr_t stride)
>  {
>      #define SAO_BO_BITS 5
> @@ -81,6 +94,7 @@
>  void Setup_C_LoopFilterPrimitives(EncoderPrimitives &p)
>  {
>      p.saoCuOrgE0 = processSaoCUE0;
> +    p.saoCuOrgE2 = processSaoCUE2;
>      p.saoCuOrgB0 = processSaoCUB0;
>      p.sign = calSign;
>  }
> diff -r feebd0ecda69 -r 382dc33423b4 source/common/primitives.h
> --- a/source/common/primitives.h	Mon Jan 05 18:57:20 2015 +0530
> +++ b/source/common/primitives.h	Tue Jan 06 16:17:36 2015 +0530
> @@ -191,6 +191,7 @@
>  typedef void (*addAvg_t)(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
>  
>  typedef void (*saoCuOrgE0_t)(pixel* rec, int8_t* offsetEo, int width, int8_t signLeft);
> +typedef void (*saoCuOrgE2_t)(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
>  typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
>  typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
>  typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
> @@ -244,6 +245,7 @@
>  
>      sign_t                sign;
>      saoCuOrgE0_t          saoCuOrgE0;
> +    saoCuOrgE2_t          saoCuOrgE2;
>      saoCuOrgB0_t          saoCuOrgB0;
>  
>      downscale_t           frameInitLowres;
> diff -r feebd0ecda69 -r 382dc33423b4 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp	Mon Jan 05 18:57:20 2015 +0530
> +++ b/source/common/x86/asm-primitives.cpp	Tue Jan 06 16:17:36 2015 +0530
> @@ -1650,6 +1650,7 @@
>      {
>          p.sign = x265_calSign_sse4;
>          p.saoCuOrgE0 = x265_saoCuOrgE0_sse4;
> +        p.saoCuOrgE2 = x265_saoCuOrgE2_sse4;
>          p.saoCuOrgB0 = x265_saoCuOrgB0_sse4;
>  
>          LUMA_ADDAVG(_sse4);
> diff -r feebd0ecda69 -r 382dc33423b4 source/common/x86/loopfilter.asm
> --- a/source/common/x86/loopfilter.asm	Mon Jan 05 18:57:20 2015 +0530
> +++ b/source/common/x86/loopfilter.asm	Tue Jan 06 16:17:36 2015 +0530
> @@ -35,6 +35,7 @@
>  cextern pb_1
>  cextern pb_128
>  cextern pb_2
> +cextern pw_2
>  
>  
>  ;============================================================================================================
> @@ -85,6 +86,58 @@
>      jnz        .loop
>      RET
>  
> +;======================================================================================================================================================
> +; void saoCuOrgE2(pixel * rec, int8_t * bufft, int8_t * buff1, int8_t * offsetEo, int lcuWidth, intptr_t stride)
> +;======================================================================================================================================================
> +INIT_XMM sse4
> +cglobal saoCuOrgE2, 5, 7, 8, rec, bufft, buff1, offsetEo, lcuWidth
> +    ; 16 pixels per pass; lcuWidth must be a non-zero multiple of 16 (a zero count would wrap the dec/jnz loop, any remainder is skipped)
> +    mov         r6,    16                      ; r6 = bytes advanced per iteration
> +    mov         r5d,   r5m                     ; r5 = stride (only the low 32 bits are loaded)
> +    pxor        m0,    m0                      ; m0 = 0
> +    mova        m6,    [pb_2]                  ; m6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
> +    mova        m7,    [pb_128]                ; m7 = bias XORed into both operands before the signed byte compare
> +    shr         r4d,   4                       ; r4 = lcuWidth / 16 = iteration count
> +    inc         r1q                            ; r1 = &bufft[1], so stores land at bufft[x + 1]
> +
> +    .loop:
> +         movu        m1,    [r0]                    ; m1 = rec[x]
> +         movu        m2,    [r0 + r5 + 1]           ; m2 = rec[x + stride + 1]
> +         pxor        m3,    m1,    m7               ; m3 = rec[x] ^ m7
> +         pxor        m4,    m2,    m7               ; m4 = rec[x + stride + 1] ^ m7
> +         pcmpgtb     m2,    m3,    m4               ; m2 = -1 where m3 > m4
> +         pcmpgtb     m4,    m3                      ; m4 = -1 where m4 > m3
> +         pand        m2,    [pb_1]                  ; "greater" mask -> +1 (pb_1 presumably holds 1 in every byte)
> +         por         m2,    m4                      ; m2 = signDown (+1, 0 or -1 per byte)
> +         movu        m3,    [r2]                    ; m3 = buff1
> +
> +         paddb       m3,    m2
> +         paddb       m3,    m6                      ; m3 = edgeType
> +
> +         movu        m4,    [r3]                    ; m4 = offsetEo (full 16-byte load; table must be readable that far)
> +         pshufb      m4,    m3                      ; m4 = offsetEo[edgeType] per byte
> +
> +         psubb       m3,    m0,    m2               ; m3 = -signDown
> +         movu        [r1],  m3                      ; bufft[x + 1] = -signDown
> +
> +         pmovzxbw    m2,    m1                      ; m2 = low 8 pixels zero-extended to words
> +         punpckhbw   m1,    m0                      ; m1 = high 8 pixels zero-extended to words
> +         pmovsxbw    m3,    m4                      ; m3 = low 8 offsets sign-extended to words
> +         punpckhbw   m4,    m4                      ; duplicate high 8 offsets into both bytes of each word
> +         psraw       m4,    8                       ; m4 = high 8 offsets sign-extended to words
> +
> +         paddw       m2,    m3
> +         paddw       m1,    m4
> +         packuswb    m2,    m1                      ; pack with unsigned saturation to [0, 255]
> +         movu        [r0],  m2                      ; rec[x] = clipped result
> +
> +         add         r0,    r6
> +         add         r1,    r6
> +         add         r2,    r6
> +         dec         r4d
> +         jnz         .loop
> +    RET
> +
>  ;=====================================================================================
>  ; void saoCuOrgB0(pixel* rec, const pixel* offset, int lcuWidth, int lcuHeight, int stride)
>  ;=====================================================================================
> diff -r feebd0ecda69 -r 382dc33423b4 source/common/x86/loopfilter.h
> --- a/source/common/x86/loopfilter.h	Mon Jan 05 18:57:20 2015 +0530
> +++ b/source/common/x86/loopfilter.h	Tue Jan 06 16:17:36 2015 +0530
> @@ -26,6 +26,7 @@
>  #define X265_LOOPFILTER_H
>  
>  void x265_saoCuOrgE0_sse4(pixel * rec, int8_t * offsetEo, int endX, int8_t signLeft);
> +void x265_saoCuOrgE2_sse4(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
>  void x265_saoCuOrgB0_sse4(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
>  void x265_calSign_sse4(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
>  
> diff -r feebd0ecda69 -r 382dc33423b4 source/encoder/sao.cpp
> --- a/source/encoder/sao.cpp	Mon Jan 05 18:57:20 2015 +0530
> +++ b/source/encoder/sao.cpp	Tue Jan 06 16:17:36 2015 +0530
> @@ -385,23 +385,54 @@
>                  upBuff1[x] = signOf(rec[x] - tmpU[x - 1]);
>          }
>  
> -        for (y = startY; y < endY; y++)
> +        if (ctuWidth & 15)
>          {
> -            upBufft[startX] = signOf(rec[stride + startX] - tmpL[y]);
> -            for (x = startX; x < endX; x++)
> -            {
> -                int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
> -                int edgeType = signDown + upBuff1[x] + 2;
> -                upBufft[x + 1] = -signDown;
> -                rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
> -            }
> +             for (y = startY; y < endY; y++)
> +             {
> +                 upBufft[startX] = signOf(rec[stride + startX] - tmpL[y]);
> +                 for (x = startX; x < endX; x++)
> +                 {
> +                     int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
> +                     int edgeType = signDown + upBuff1[x] + 2;
> +                     upBufft[x + 1] = -signDown;
> +                     rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
> +                 }
>  
> -            std::swap(upBuff1, upBufft);
> +                 std::swap(upBuff1, upBufft);
>  
> -            rec += stride;
> +                 rec += stride;
> +             }
>          }
> +         else
> +         {
> +             for (y = startY; y < endY; y++)
> +             {
> +                 int8_t iSignDown2 = signOf(rec[stride + startX] - tmpL[y]);
> +                 pixel firstPxl = rec[0];  // copy first Pxl
> +                 pixel lastPxl = rec[ctuWidth - 1];
> +                 int8_t one = upBufft[1];
> +                 int8_t two = upBufft[endX + 1];
>  
> -        break;
> +                 primitives.saoCuOrgE2(rec, upBufft, upBuff1, m_offsetEo, ctuWidth, stride);
> +                 if (!lpelx)
> +                 {
> +                     rec[0] = firstPxl;
> +                     upBufft[1] = one;
> +                 }
> +
> +                 if (rpelx == picWidth)
> +                 {
> +                     rec[ctuWidth - 1] = lastPxl;
> +                     upBufft[endX + 1] = two;
> +                 }
> +
> +                 upBufft[startX] = iSignDown2;
> +
> +                 std::swap(upBuff1, upBufft);
> +                 rec += stride;
> +             }
> +         }
> +         break;
>      }
>      case SAO_EO_3: // dir: 45
>      {
> diff -r feebd0ecda69 -r 382dc33423b4 source/test/pixelharness.cpp
> --- a/source/test/pixelharness.cpp	Mon Jan 05 18:57:20 2015 +0530
> +++ b/source/test/pixelharness.cpp	Tue Jan 06 16:17:36 2015 +0530
> @@ -65,7 +65,9 @@
>          sbuf1[i] = (rand() % (2 * SMAX + 1)) - SMAX - 1; //max(SHORT_MIN, min(rand(), SMAX));
>          sbuf2[i] = (rand() % (2 * SMAX + 1)) - SMAX - 1; //max(SHORT_MIN, min(rand(), SMAX));
>          ibuf1[i] = (rand() % (2 * SMAX + 1)) - SMAX - 1;
> -        psbuf1[i] = (rand() % 65) - 32;                   // range is between -32 to 32
> +        psbuf1[i] = psbuf4[i] = (rand() % 65) - 32;                   // range is between -32 to 32
> +        psbuf2[i] = (rand() % 3) - 1;
> +        psbuf3[i] = (rand() % 129) - 128;
>          sbuf3[i] = rand() % PIXEL_MAX; // for blockcopy only
>      }
>  }
> @@ -917,6 +919,37 @@
>      return true;
>  }
>  
> +bool PixelHarness::check_saoCuOrgE2_t(saoCuOrgE2_t ref, saoCuOrgE2_t opt)
> +{
> +    ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
> +    ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
> +    // Runs ref and opt on identical inputs; fails if the pixel buffers or the bufft sign outputs diverge.
> +    memset(ref_dest, 0xCD, sizeof(ref_dest));
> +    memset(opt_dest, 0xCD, sizeof(opt_dest));
> +
> +    int j = 0;
> +
> +    for (int i = 0; i < ITERS; i++)
> +    {
> +        int width = 16 * (rand() % 4 + 1);   // 16, 32, 48 or 64
> +        int stride = width + 1;
> +
> +        ref(ref_dest, psbuf1 + j, psbuf2 + j, psbuf3 + j, width, stride);
> +        checked(opt, opt_dest, psbuf4 + j, psbuf2 + j, psbuf3 + j, width, stride);
> +        // The primitive stores bufft[1..width], so width + 1 bytes are compared to cover the final store.
> +        if (memcmp(psbuf1 + j, psbuf4 + j, (width + 1) * sizeof(int8_t)))
> +            return false;
> +
> +        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
> +            return false;
> +
> +        reportfail();
> +        j += INCR;
> +    }
> +
> +    return true;
> +}
> +
>  bool PixelHarness::check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt)
>  {
>      ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
> @@ -1463,6 +1496,15 @@
>          }
>      }
>  
> +    if (opt.saoCuOrgE2)
> +    {
> +        if (!check_saoCuOrgE2_t(ref.saoCuOrgE2, opt.saoCuOrgE2))
> +        {
> +            printf("SAO_EO_2 failed\n");
> +            return false;
> +        }
> +    }
> +
>      if (opt.saoCuOrgB0)
>      {
>          if (!check_saoCuOrgB0_t(ref.saoCuOrgB0, opt.saoCuOrgB0))
> @@ -1801,6 +1843,12 @@
>          REPORT_SPEEDUP(opt.saoCuOrgE0, ref.saoCuOrgE0, pbuf1, psbuf1, 64, 1);
>      }
>  
> +    if (opt.saoCuOrgE2)
> +    {
> +        HEADER0("SAO_EO_2");
> +        REPORT_SPEEDUP(opt.saoCuOrgE2, ref.saoCuOrgE2, pbuf1, psbuf1, psbuf2, psbuf3, 64, 64);
> +    }
> +
>      if (opt.saoCuOrgB0)
>      {
>          HEADER0("SAO_BO_0");
> diff -r feebd0ecda69 -r 382dc33423b4 source/test/pixelharness.h
> --- a/source/test/pixelharness.h	Mon Jan 05 18:57:20 2015 +0530
> +++ b/source/test/pixelharness.h	Tue Jan 06 16:17:36 2015 +0530
> @@ -47,6 +47,9 @@
>      pixel    pbuf4[BUFFSIZE];
>      int      ibuf1[BUFFSIZE];
>      int8_t   psbuf1[BUFFSIZE];
> +    int8_t   psbuf2[BUFFSIZE];
> +    int8_t   psbuf3[BUFFSIZE];
> +    int8_t   psbuf4[BUFFSIZE];
>  
>      int16_t  sbuf1[BUFFSIZE];
>      int16_t  sbuf2[BUFFSIZE];
> @@ -90,6 +93,7 @@
>      bool check_ssim_end(ssim_end4_t ref, ssim_end4_t opt);
>      bool check_addAvg(addAvg_t, addAvg_t);
>      bool check_saoCuOrgE0_t(saoCuOrgE0_t ref, saoCuOrgE0_t opt);
> +    bool check_saoCuOrgE2_t(saoCuOrgE2_t ref, saoCuOrgE2_t opt);
>      bool check_saoCuOrgB0_t(saoCuOrgB0_t ref, saoCuOrgB0_t opt);
>      bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);
>      bool check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt);
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-- 
Steve Borho


More information about the x265-devel mailing list