[x265] [PATCH] sao: modify C and SSE4 code for saoCuOrgE0 to process 2 rows

Divya Manivannan divya at multicorewareinc.com
Thu Apr 2 07:01:38 CEST 2015


Thanks, Steve. I will rename the `signLeft` variable that triggers the -Wshadow warning in sao.cpp and resend the patch.

On Thu, Apr 2, 2015 at 1:44 AM, Steve Borho <steve at borho.org> wrote:

> On Wed, Apr 1, 2015 at 8:40 AM, Divya Manivannan
> <divya at multicorewareinc.com> wrote:
> > # HG changeset patch
> > # User Divya Manivannan <divya at multicorewareinc.com>
> > # Date 1427895336 -19800
> > #      Wed Apr 01 19:05:36 2015 +0530
> > # Node ID f718abdc8004d0c859266b292730b7b5b3d0d4df
> > # Parent  ac85c775620f1dcb0df056874633cbf916098bd2
> > sao: modify C and SSE4 code for saoCuOrgE0 to process 2 rows
>
> /home/sborho/repos/x265/source/encoder/sao.cpp: In member function
> 'void x265::SAO::processSaoCu(int, int, int)':
> /home/sborho/repos/x265/source/encoder/sao.cpp:289:21: warning:
> declaration of 'signLeft' shadows a previous local [-Wshadow]
>                  int signLeft = signOf(rec[startX] - tmpL[y]);
>                      ^
> /home/sborho/repos/x265/source/encoder/sao.cpp:261:64: warning:
> shadowed declaration is here [-Wshadow]
>      int8_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1,
> signLeft[2];
>
>
> > diff -r ac85c775620f -r f718abdc8004 source/common/loopfilter.cpp
> > --- a/source/common/loopfilter.cpp      Tue Mar 31 20:04:28 2015 -0500
> > +++ b/source/common/loopfilter.cpp      Wed Apr 01 19:05:36 2015 +0530
> > @@ -42,18 +42,23 @@
> >          dst[x] = signOf(src1[x] - src2[x]);
> >  }
> >
> > -void processSaoCUE0(pixel * rec, int8_t * offsetEo, int width, int8_t
> signLeft)
> > +void processSaoCUE0(pixel * rec, int8_t * offsetEo, int width, int8_t*
> signLeft, intptr_t stride)
> >  {
> > -    int x;
> > -    int8_t signRight;
> > +    int x, y;
> > +    int8_t signRight, signLeft0;
> >      int8_t edgeType;
> >
> > -    for (x = 0; x < width; x++)
> > +    for (y = 0; y < 2; y++)
> >      {
> > -        signRight = ((rec[x] - rec[x + 1]) < 0) ? -1 : ((rec[x] - rec[x
> + 1]) > 0) ? 1 : 0;
> > -        edgeType = signRight + signLeft + 2;
> > -        signLeft  = -signRight;
> > -        rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
> > +        signLeft0 = signLeft[y];
> > +        for (x = 0; x < width; x++)
> > +        {
> > +            signRight = ((rec[x] - rec[x + 1]) < 0) ? -1 : ((rec[x] -
> rec[x + 1]) > 0) ? 1 : 0;
> > +            edgeType = signRight + signLeft0 + 2;
> > +            signLeft0 = -signRight;
> > +            rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
> > +        }
> > +        rec += stride;
> >      }
> >  }
> >
> > diff -r ac85c775620f -r f718abdc8004 source/common/primitives.h
> > --- a/source/common/primitives.h        Tue Mar 31 20:04:28 2015 -0500
> > +++ b/source/common/primitives.h        Wed Apr 01 19:05:36 2015 +0530
> > @@ -169,7 +169,7 @@
> >  typedef void (*pixelavg_pp_t)(pixel* dst, intptr_t dstride, const
> pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int
> weight);
> >  typedef void (*addAvg_t)(const int16_t* src0, const int16_t* src1,
> pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
> >
> > -typedef void (*saoCuOrgE0_t)(pixel* rec, int8_t* offsetEo, int width,
> int8_t signLeft);
> > +typedef void (*saoCuOrgE0_t)(pixel* rec, int8_t* offsetEo, int width,
> int8_t* signLeft, intptr_t stride);
> >  typedef void (*saoCuOrgE1_t)(pixel* rec, int8_t* upBuff1, int8_t*
> offsetEo, intptr_t stride, int width);
> >  typedef void (*saoCuOrgE2_t)(pixel* rec, int8_t* pBufft, int8_t*
> pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
> >  typedef void (*saoCuOrgE3_t)(pixel* rec, int8_t* upBuff1, int8_t*
> m_offsetEo, intptr_t stride, int startX, int endX);
> > diff -r ac85c775620f -r f718abdc8004 source/common/x86/loopfilter.asm
> > --- a/source/common/x86/loopfilter.asm  Tue Mar 31 20:04:28 2015 -0500
> > +++ b/source/common/x86/loopfilter.asm  Wed Apr 01 19:05:36 2015 +0530
> > @@ -39,20 +39,25 @@
> >
> >
> >
> ;============================================================================================================
> > -; void saoCuOrgE0(pixel * rec, int8_t * offsetEo, int lcuWidth, int8_t
> signLeft)
> > +; void saoCuOrgE0(pixel * rec, int8_t * offsetEo, int lcuWidth, int8_t*
> signLeft, intptr_t stride)
> >
> ;============================================================================================================
> >  INIT_XMM sse4
> > -cglobal saoCuOrgE0, 4, 4, 8, rec, offsetEo, lcuWidth, signLeft
> > +cglobal saoCuOrgE0, 5, 6, 8, rec, offsetEo, lcuWidth, signLeft, stride
> >
> > -    neg         r3                          ; r3 = -signLeft
> > -    movzx       r3d, r3b
> > -    movd        m0, r3d
> > -    mova        m4, [pb_128]                ; m4 = [80]
> > -    pxor        m5, m5                      ; m5 = 0
> > -    movu        m6, [r1]                    ; m6 = offsetEo
> > +    mov         r4d, r4m
> > +    mova        m4,  [pb_128]                ; m4 = [80]
> > +    pxor        m5,  m5                      ; m5 = 0
> > +    movu        m6,  [r1]                    ; m6 = offsetEo
> > +
> > +    movzx       r5d, byte [r3]
> > +    inc         r3
> > +    neg         r5b
> > +    movd        m0, r5d
> > +    lea         r5, [r0 + r4]
> > +    mov         r4d, r2d
> >
> >  .loop:
> > -    movu        m7, [r0]                    ; m1 = rec[x]
> > +    movu        m7, [r0]                    ; m7 = rec[x]
> >      movu        m2, [r0 + 1]                ; m2 = rec[x+1]
> >
> >      pxor        m1, m7, m4
> > @@ -69,7 +74,7 @@
> >      pxor        m0, m0
> >      palignr     m0, m2, 15
> >      paddb       m2, m3
> > -    paddb       m2, [pb_2]                  ; m1 = uiEdgeType
> > +    paddb       m2, [pb_2]                  ; m2 = uiEdgeType
> >      pshufb      m3, m6, m2
> >      pmovzxbw    m2, m7                      ; rec
> >      punpckhbw   m7, m5
> > @@ -84,6 +89,43 @@
> >      add         r0q, 16
> >      sub         r2d, 16
> >      jnz        .loop
> > +
> > +    movzx       r3d, byte [r3]
> > +    neg         r3b
> > +    movd        m0, r3d
> > +.loopH:
> > +    movu        m7, [r5]                    ; m7 = rec[x]
> > +    movu        m2, [r5 + 1]                ; m2 = rec[x+1]
> > +
> > +    pxor        m1, m7, m4
> > +    pxor        m3, m2, m4
> > +    pcmpgtb     m2, m1, m3
> > +    pcmpgtb     m3, m1
> > +    pand        m2, [pb_1]
> > +    por         m2, m3
> > +
> > +    pslldq      m3, m2, 1
> > +    por         m3, m0
> > +
> > +    psignb      m3, m4                      ; m3 = signLeft
> > +    pxor        m0, m0
> > +    palignr     m0, m2, 15
> > +    paddb       m2, m3
> > +    paddb       m2, [pb_2]                  ; m2 = uiEdgeType
> > +    pshufb      m3, m6, m2
> > +    pmovzxbw    m2, m7                      ; rec
> > +    punpckhbw   m7, m5
> > +    pmovsxbw    m1, m3                      ; offsetEo
> > +    punpckhbw   m3, m3
> > +    psraw       m3, 8
> > +    paddw       m2, m1
> > +    paddw       m7, m3
> > +    packuswb    m2, m7
> > +    movu        [r5], m2
> > +
> > +    add         r5q, 16
> > +    sub         r4d, 16
> > +    jnz        .loopH
> >      RET
> >
> >
> ;==================================================================================================
> > diff -r ac85c775620f -r f718abdc8004 source/common/x86/loopfilter.h
> > --- a/source/common/x86/loopfilter.h    Tue Mar 31 20:04:28 2015 -0500
> > +++ b/source/common/x86/loopfilter.h    Wed Apr 01 19:05:36 2015 +0530
> > @@ -25,7 +25,7 @@
> >  #ifndef X265_LOOPFILTER_H
> >  #define X265_LOOPFILTER_H
> >
> > -void x265_saoCuOrgE0_sse4(pixel * rec, int8_t * offsetEo, int endX,
> int8_t signLeft);
> > +void x265_saoCuOrgE0_sse4(pixel * rec, int8_t * offsetEo, int endX,
> int8_t* signLeft, intptr_t stride);
> >  void x265_saoCuOrgE1_sse4(pixel* rec, int8_t* upBuff1, int8_t*
> offsetEo, intptr_t stride, int width);
> >  void x265_saoCuOrgE2_sse4(pixel* rec, int8_t* pBufft, int8_t* pBuff1,
> int8_t* offsetEo, int lcuWidth, intptr_t stride);
> >  void x265_saoCuOrgE3_sse4(pixel *rec, int8_t *upBuff1, int8_t
> *m_offsetEo, intptr_t stride, int startX, int endX);
> > diff -r ac85c775620f -r f718abdc8004 source/encoder/sao.cpp
> > --- a/source/encoder/sao.cpp    Tue Mar 31 20:04:28 2015 -0500
> > +++ b/source/encoder/sao.cpp    Wed Apr 01 19:05:36 2015 +0530
> > @@ -258,7 +258,7 @@
> >      pixel* tmpL;
> >      pixel* tmpU;
> >
> > -    int8_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1;
> > +    int8_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1,
> signLeft[2];
> >      int8_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1;
> >
> >      memset(_upBuff1 + MAX_CU_SIZE, 0, 2 * sizeof(int8_t)); /* avoid
> valgrind uninit warnings */
> > @@ -279,7 +279,7 @@
> >      {
> >      case SAO_EO_0: // dir: -
> >      {
> > -        pixel firstPxl = 0, lastPxl = 0;
> > +        pixel firstPxl = 0, lastPxl = 0, row1FirstPxl = 0, row1LastPxl
> = 0;
> >          startX = !lpelx;
> >          endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
> >          if (ctuWidth & 15)
> > @@ -301,25 +301,38 @@
> >          }
> >          else
> >          {
> > -            for (y = 0; y < ctuHeight; y++)
> > +            for (y = 0; y < ctuHeight; y += 2)
> >              {
> > -                int signLeft = signOf(rec[startX] - tmpL[y]);
> > -
> > -                if (!lpelx)
> > -                    firstPxl = rec[0];
> > -
> > -                if (rpelx == picWidth)
> > -                    lastPxl = rec[ctuWidth - 1];
> > -
> > -                primitives.saoCuOrgE0(rec, m_offsetEo, ctuWidth,
> (int8_t)signLeft);
> > -
> > -                if (!lpelx)
> > -                    rec[0] = firstPxl;
> > -
> > -                if (rpelx == picWidth)
> > -                    rec[ctuWidth - 1] = lastPxl;
> > -
> > -                rec += stride;
> > +                signLeft[0] = signOf(rec[startX] - tmpL[y]);
> > +                signLeft[1] = signOf(rec[stride + startX] - tmpL[y +
> 1]);
> > +
> > +                if (!lpelx)
> > +                {
> > +                    firstPxl = rec[0];
> > +                    row1FirstPxl = rec[stride];
> > +                }
> > +
> > +                if (rpelx == picWidth)
> > +                {
> > +                    lastPxl = rec[ctuWidth - 1];
> > +                    row1LastPxl = rec[stride + ctuWidth - 1];
> > +                }
> > +
> > +                primitives.saoCuOrgE0(rec, m_offsetEo, ctuWidth,
> signLeft, stride);
> > +
> > +                if (!lpelx)
> > +                {
> > +                    rec[0] = firstPxl;
> > +                    rec[stride] = row1FirstPxl;
> > +                }
> > +
> > +                if (rpelx == picWidth)
> > +                {
> > +                    rec[ctuWidth - 1] = lastPxl;
> > +                    rec[stride + ctuWidth - 1] = row1LastPxl;
> > +                }
> > +
> > +                rec += 2 * stride;
> >              }
> >          }
> >          break;
> > diff -r ac85c775620f -r f718abdc8004 source/test/pixelharness.cpp
> > --- a/source/test/pixelharness.cpp      Tue Mar 31 20:04:28 2015 -0500
> > +++ b/source/test/pixelharness.cpp      Wed Apr 01 19:05:36 2015 +0530
> > @@ -908,12 +908,10 @@
> >      for (int i = 0; i < ITERS; i++)
> >      {
> >          int width = 16 * (rand() % 4 + 1);
> > -        int8_t sign = rand() % 3;
> > -        if (sign == 2)
> > -            sign = -1;
> > -
> > -        ref(ref_dest, psbuf1 + j, width, sign);
> > -        checked(opt, opt_dest, psbuf1 + j, width, sign);
> > +        int stride = width + 1;
> > +
> > +        ref(ref_dest, psbuf1 + j, width, psbuf2 + j, stride);
> > +        checked(opt, opt_dest, psbuf1 + j, width, psbuf5 + j, stride);
> >
> >          if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
> >              return false;
> > @@ -2058,7 +2056,7 @@
> >      if (opt.saoCuOrgE0)
> >      {
> >          HEADER0("SAO_EO_0");
> > -        REPORT_SPEEDUP(opt.saoCuOrgE0, ref.saoCuOrgE0, pbuf1, psbuf1,
> 64, 1);
> > +        REPORT_SPEEDUP(opt.saoCuOrgE0, ref.saoCuOrgE0, pbuf1, psbuf1,
> 64, psbuf2, 64);
> >      }
> >
> >      if (opt.saoCuOrgE1)
> > _______________________________________________
> > x265-devel mailing list
> > x265-devel at videolan.org
> > https://mailman.videolan.org/listinfo/x265-devel
>
>
>
> --
> Steve Borho
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150402/eca65c0d/attachment-0001.html>


More information about the x265-devel mailing list