[x265] [PATCH] sao: only load/save entropy state of sao when applying sao

Ximing Cheng chengximing1989 at foxmail.com
Mon Nov 20 18:20:43 CET 2017


The generic function copyFrom() copies MAX_OFF_CTX_MOD bytes (MAX_OFF_CTX_MOD == 157), but SAO only needs two of them.
Building the load function in release mode, we get the following disassembly for its memcpy:


void Entropy::copyFrom(const Entropy& src)
{
    X265_CHECK(src.m_valid, "invalid copy source context\n");


    copyState(src);
000000013F4334D0  mov         eax,dword ptr [rdx+0B0h]  
000000013F4334D6  mov         dword ptr [rcx+0B0h],eax  
000000013F4334DC  mov         eax,dword ptr [rdx+0B4h]  
000000013F4334E2  mov         dword ptr [rcx+0B4h],eax  
000000013F4334E8  mov         eax,dword ptr [rdx+0C0h]  
000000013F4334EE  mov         dword ptr [rcx+0C0h],eax  
000000013F4334F4  mov         eax,dword ptr [rdx+0B8h]  
000000013F4334FA  mov         dword ptr [rcx+0B8h],eax  
000000013F433500  mov         eax,dword ptr [rdx+0BCh]  
000000013F433506  mov         dword ptr [rcx+0BCh],eax  
000000013F43350C  mov         rax,qword ptr [rdx+0C8h]  
000000013F433513  mov         qword ptr [rcx+0C8h],rax  


    memcpy(m_contextState, src.m_contextState, MAX_OFF_CTX_MOD * sizeof(uint8_t));
000000013F43351A  movups      xmm0,xmmword ptr [rdx+10h]  
000000013F43351E  movups      xmmword ptr [rcx+10h],xmm0  
000000013F433522  movups      xmm1,xmmword ptr [rdx+20h]  
000000013F433526  movups      xmmword ptr [rcx+20h],xmm1  
000000013F43352A  movups      xmm0,xmmword ptr [rdx+30h]  
000000013F43352E  movups      xmmword ptr [rcx+30h],xmm0  
000000013F433532  movups      xmm1,xmmword ptr [rdx+40h]  
000000013F433536  movups      xmmword ptr [rcx+40h],xmm1  
000000013F43353A  movups      xmm0,xmmword ptr [rdx+50h]  
000000013F43353E  movups      xmmword ptr [rcx+50h],xmm0  
000000013F433542  movups      xmm1,xmmword ptr [rdx+60h]  
000000013F433546  movups      xmmword ptr [rcx+60h],xmm1  
000000013F43354A  movups      xmm0,xmmword ptr [rdx+70h]  
000000013F43354E  movups      xmmword ptr [rcx+70h],xmm0  
000000013F433552  movups      xmm1,xmmword ptr [rdx+80h]  
000000013F433559  movups      xmmword ptr [rcx+80h],xmm1  
000000013F433560  movups      xmm0,xmmword ptr [rdx+90h]  
000000013F433567  movups      xmmword ptr [rcx+90h],xmm0  
000000013F43356E  movsd       xmm1,mmword ptr [rdx+0A0h]  
000000013F433576  movsd       mmword ptr [rcx+0A0h],xmm1  
000000013F43357E  mov         eax,dword ptr [rdx+0A8h]  
000000013F433584  mov         dword ptr [rcx+0A8h],eax  
000000013F43358A  movzx       eax,byte ptr [rdx+0ACh]  
000000013F433591  mov         byte ptr [rcx+0ACh],al  
    markValid();
}



We can see that the VC++ compiler emits a long run of movups instructions to copy the data. For comparison, here is the disassembly of copyFromSAO(), which loadSAO() calls:


void Entropy::copyFromSAO(const Entropy& src)
{
    X265_CHECK(src.m_valid, "invalid copy source context\n");


    copyState(src);
000000013F4335A0  mov         eax,dword ptr [rdx+0B0h]  
000000013F4335A6  mov         dword ptr [rcx+0B0h],eax  
000000013F4335AC  mov         eax,dword ptr [rdx+0B4h]  
000000013F4335B2  mov         dword ptr [rcx+0B4h],eax  
000000013F4335B8  mov         eax,dword ptr [rdx+0C0h]  
000000013F4335BE  mov         dword ptr [rcx+0C0h],eax  
000000013F4335C4  mov         eax,dword ptr [rdx+0B8h]  
000000013F4335CA  mov         dword ptr [rcx+0B8h],eax  
000000013F4335D0  mov         eax,dword ptr [rdx+0BCh]  
000000013F4335D6  mov         dword ptr [rcx+0BCh],eax  
000000013F4335DC  mov         rax,qword ptr [rdx+0C8h]  
000000013F4335E3  mov         qword ptr [rcx+0C8h],rax  


    // only copy OFF_SAO_MERGE_FLAG_CTX and OFF_SAO_TYPE_IDX_CTX, two bytes
    uint16_t* srcEntropy = (uint16_t*)(src.m_contextState + OFF_SAO_MERGE_FLAG_CTX);
    uint16_t* dstEntropy = (uint16_t*)(m_contextState + OFF_SAO_MERGE_FLAG_CTX);
    *dstEntropy = *srcEntropy;
000000013F4335EA  movzx       eax,word ptr [rdx+0A8h]  
000000013F4335F1  mov         word ptr [rcx+0A8h],ax  
    markValid();
}



Here the copy takes only one movzx and one mov instruction, because only two bytes are copied.


------------------ Original ------------------
From:  "Ashok Kumar Mishra";<ashok at multicorewareinc.com>;
Date:  Mon, Nov 20, 2017 05:27 PM
To:  "Development for x265"<x265-devel at videolan.org>;

Subject:  Re: [x265] [PATCH] sao: only load/save entropy state of sao when applying sao





On Sat, Nov 18, 2017 at 6:25 PM, Ximing Cheng <chengximing1989 at foxmail.com> wrote:
# HG changeset patch
 # User Ximing Cheng <ximingcheng at tencent.com>
 # Date 1511009612 -28800
 #      Sat Nov 18 20:53:32 2017 +0800
 # Node ID a5b805430a4b0cb797604ab7f0176538d93e8d9d
 # Parent  06979c0423504a324ea05ca3de59769c6d0fba0d
 sao: only load/save entropy state of sao when applying sao
 
 diff -r 06979c042350 -r a5b805430a4b source/encoder/entropy.cpp
 --- a/source/encoder/entropy.cpp        Thu Nov 16 20:23:14 2017 +0530
 +++ b/source/encoder/entropy.cpp        Sat Nov 18 20:53:32 2017 +0800
 @@ -1506,6 +1506,19 @@
      markValid();
  }
 
We use a generic function, copyFrom(), for loading and storing the context during analysis. copyFrom() uses memcpy(), so it is faster.
There is not much benefit in writing a separate function for SAO, even though only two bytes change.
  
 +void Entropy::copyFromSAO(const Entropy& src)
 +{
 +    X265_CHECK(src.m_valid, "invalid copy source context\n");
 +
 +    copyState(src);
 +
 +    // only copy OFF_SAO_MERGE_FLAG_CTX and OFF_SAO_TYPE_IDX_CTX, two bytes
 +    uint16_t* srcEntropy = (uint16_t*)(src.m_contextState + OFF_SAO_MERGE_FLAG_CTX);
 +    uint16_t* dstEntropy = (uint16_t*)(m_contextState + OFF_SAO_MERGE_FLAG_CTX);
 +    *dstEntropy = *srcEntropy;
 +    markValid();
 +}
 +
 
  void Entropy::codePartSize(const CUData& cu, uint32_t absPartIdx, uint32_t depth)
  {
      PartSize partSize = (PartSize)cu.m_partSize[absPartIdx];
 diff -r 06979c042350 -r a5b805430a4b source/encoder/entropy.h
 --- a/source/encoder/entropy.h  Thu Nov 16 20:23:14 2017 +0530
 +++ b/source/encoder/entropy.h  Sat Nov 18 20:53:32 2017 +0800
 @@ -136,7 +136,9 @@
 
      // SBAC RD
      void load(const Entropy& src)            { copyFrom(src); }
 +    void loadSAO(const Entropy& src)         { copyFromSAO(src); }
      void store(Entropy& dest) const          { dest.copyFrom(*this); }
 +    void storeSAO(Entropy& dest) const       { dest.copyFromSAO(*this); }
      void loadContexts(const Entropy& src)    { copyContextsFrom(src); }
      void loadIntraDirModeLuma(const Entropy& src);
      void copyState(const Entropy& other);
 @@ -254,6 +256,7 @@
                           bool& bCodeDQP, const uint32_t depthRange[2]);
 
      void copyFrom(const Entropy& src);
 +    void copyFromSAO(const Entropy& src);
      void copyContextsFrom(const Entropy& src);
  };
  }
 diff -r 06979c042350 -r a5b805430a4b source/encoder/sao.cpp
 --- a/source/encoder/sao.cpp    Thu Nov 16 20:23:14 2017 +0530
 +++ b/source/encoder/sao.cpp    Sat Nov 18 20:53:32 2017 +0800
 @@ -242,9 +242,9 @@
          break;
      }
 
 -    m_entropyCoder.load(initState);
 -    m_rdContexts.next.load(initState);
 -    m_rdContexts.cur.load(initState);
 +    m_entropyCoder.loadSAO(initState);
 +    m_rdContexts.next.loadSAO(initState);
 +    m_rdContexts.cur.loadSAO(initState);
 
      SAOParam* saoParam = frame->m_encData->m_saoParam;
      if (!saoParam)
 @@ -1262,13 +1262,13 @@
      for (int i = 0; i < planes; i++)
          saoParam->ctuParam[i][addr].reset();
      // SAO distortion calculation
 -    m_entropyCoder.load(m_rdContexts.cur);
 +    m_entropyCoder.loadSAO(m_rdContexts.cur);
      m_entropyCoder.resetBits();
      if (allowMerge[0])
          m_entropyCoder.codeSaoMerge(0);
      if (allowMerge[1])
          m_entropyCoder.codeSaoMerge(0);
 -    m_entropyCoder.store(m_rdContexts.temp);
 +    m_entropyCoder.storeSAO(m_rdContexts.temp);
      memset(m_offset, 0, sizeof(m_offset));
      int64_t bestCost = 0;
      int64_t rateDist = 0;
 @@ -1333,7 +1333,7 @@
                  mergeDist += (estDist << 8) / lambda[!!plane];
              }
 
 -            m_entropyCoder.load(m_rdContexts.cur);
 +            m_entropyCoder.loadSAO(m_rdContexts.cur);
              m_entropyCoder.resetBits();
              if (allowMerge[0])
                  m_entropyCoder.codeSaoMerge(1 - mergeIdx);
 @@ -1346,7 +1346,7 @@
              {
                  SaoMergeMode mergeMode = mergeIdx ? SAO_MERGE_UP : SAO_MERGE_LEFT;
                  bestCost = mergeCost;
 -                m_entropyCoder.store(m_rdContexts.temp);
 +                m_entropyCoder.storeSAO(m_rdContexts.temp);
                  for (int plane = 0; plane < planes; plane++)
                  {
                      if (saoParam->bSaoFlag[plane > 0])
 @@ -1368,8 +1368,8 @@
              m_numNoSao[0]++;
          if (chroma && saoParam->ctuParam[1][addr].typeIdx < 0)
              m_numNoSao[1]++;
 -        m_entropyCoder.load(m_rdContexts.temp);
 -        m_entropyCoder.store(m_rdContexts.cur);
 +        m_entropyCoder.loadSAO(m_rdContexts.temp);
 +        m_entropyCoder.storeSAO(m_rdContexts.cur);
      }
  }
 
 @@ -1488,7 +1488,7 @@
      int64_t costClasses[MAX_NUM_SAO_CLASS];
 
      // RDO SAO_NA
 -    m_entropyCoder.load(m_rdContexts.temp);
 +    m_entropyCoder.loadSAO(m_rdContexts.temp);
      m_entropyCoder.resetBits();
      m_entropyCoder.codeSaoType(0);
      int64_t costPartBest = calcSaoRdoCost(0, m_entropyCoder.getNumberOfWrittenBits(), lambda[0]);
 @@ -1518,7 +1518,7 @@
              estDist += distClasses[classIdx];
          }
 
 -        m_entropyCoder.load(m_rdContexts.temp);
 +        m_entropyCoder.loadSAO(m_rdContexts.temp);
          m_entropyCoder.resetBits();
          m_entropyCoder.codeSaoOffsetEO(m_offset[0][typeIdx] + 1, typeIdx, 0);
 
 @@ -1576,7 +1576,7 @@
      for (int classIdx = bestClassBO; classIdx < bestClassBO + SAO_NUM_OFFSET; classIdx++)
          estDist += distClasses[classIdx];
 
 -    m_entropyCoder.load(m_rdContexts.temp);
 +    m_entropyCoder.loadSAO(m_rdContexts.temp);
      m_entropyCoder.resetBits();
      m_entropyCoder.codeSaoOffsetBO(m_offset[0][SAO_BO] + bestClassBO, bestClassBO, 0);
 
 @@ -1595,9 +1595,9 @@
      }
 
      rateDist = (bestDist << 8) / lambda[0];
 -    m_entropyCoder.load(m_rdContexts.temp);
 +    m_entropyCoder.loadSAO(m_rdContexts.temp);
      m_entropyCoder.codeSaoOffset(*lclCtuParam, 0);
 -    m_entropyCoder.store(m_rdContexts.temp);
 +    m_entropyCoder.storeSAO(m_rdContexts.temp);
 
      if (m_param->internalCsp == X265_CSP_I400)
      {
 @@ -1616,7 +1616,7 @@
      int32_t distClasses[MAX_NUM_SAO_CLASS];
      int32_t bestClassBO[2] = { 0, 0 };
 
 -    m_entropyCoder.load(m_rdContexts.temp);
 +    m_entropyCoder.loadSAO(m_rdContexts.temp);
      m_entropyCoder.resetBits();
      m_entropyCoder.codeSaoType(0);
 
 @@ -1651,7 +1651,7 @@
              }
          }
 
 -        m_entropyCoder.load(m_rdContexts.temp);
 +        m_entropyCoder.loadSAO(m_rdContexts.temp);
          m_entropyCoder.resetBits();
 
          for (int compIdx = 0; compIdx < 2; compIdx++)
 @@ -1715,7 +1715,7 @@
              estDist[compIdx - 1] += distClasses[classIdx];
      }
 
 -    m_entropyCoder.load(m_rdContexts.temp);
 +    m_entropyCoder.loadSAO(m_rdContexts.temp);
      m_entropyCoder.resetBits();
 
      for (int compIdx = 0; compIdx < 2; compIdx++)
 @@ -1740,13 +1740,13 @@
      }
 
      rateDist += (bestDist << 8) / lambda[1];
 -    m_entropyCoder.load(m_rdContexts.temp);
 +    m_entropyCoder.loadSAO(m_rdContexts.temp);
 
      if (saoParam->bSaoFlag[1])
      {
          m_entropyCoder.codeSaoOffset(*lclCtuParam[0], 1);
          m_entropyCoder.codeSaoOffset(*lclCtuParam[1], 2);
 -        m_entropyCoder.store(m_rdContexts.temp);
 +        m_entropyCoder.storeSAO(m_rdContexts.temp);
 
          uint32_t rate = m_entropyCoder.getNumberOfWrittenBits();
          bestCost = rateDist + rate;
 
 
 _______________________________________________
 x265-devel mailing list
 x265-devel at videolan.org
 https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20171121/b34e4d0d/attachment-0001.html>


More information about the x265-devel mailing list