[x265] [PATCH] asm: split SAO_EO_0 into separate primitive func, added assembly code and testbench support

Thu Feb 27 21:06:42 CET 2014

On Thu, Feb 27, 2014 at 7:00 AM,  <dnyaneshwar at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
> # Date 1393505529 -19800
> #      Thu Feb 27 18:22:09 2014 +0530
> # Node ID 33cab8f8f6c25cb5a16b2aee8d26a65f91bc156e
> # Parent  c9a0802b64aca46509b55d134810cd1b87cd929b
> asm: split SAO_EO_0 into separate primitive func, added assembly code and testbench support
> added loopfilter.cpp, loopfilter.h, loopfilter.asm files for C and assembly code
>
> diff -r c9a0802b64ac -r 33cab8f8f6c2 source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp
> --- a/source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp        Wed Feb 26 22:16:28 2014 -0600
> +++ b/source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp        Thu Feb 27 18:22:09 2014 +0530
> @@ -45,6 +45,8 @@
>  //! \ingroup TLibCommon
>  //! \{
>
> +int g_bitDepthY = 8;

Uhm... this is not good.  It would break 10-bit encodes.  Use X265_DEPTH
anywhere you need the pixel bit depth.

> +
>  SAOParam::~SAOParam()
>  {
>      for (int i = 0; i < 3; i++)
> @@ -535,8 +537,6 @@
>      uint32_t tpely     = tmpCu->getCUPelY();
>      uint32_t rpelx;
>      uint32_t bpely;
> -    int  signLeft;
> -    int  signRight;
>      int  signDown;
>      int  signDown1;
>      int  signDown2;
> @@ -614,23 +614,60 @@
>      {
>      case SAO_EO_0: // dir: -
>      {
> -        startX = (lpelx == 0) ? 1 : 0;
> -        endX   = (rpelx == picWidthTmp) ? lcuWidth - 1 : lcuWidth;
> -        for (y = 0; y < lcuHeight; y++)
> -        {
> -            signLeft = xSign(rec[startX] - tmpL[y]);
> -            for (x = startX; x < endX; x++)
> -            {
> -                signRight =  xSign(rec[x] - rec[x + 1]);
> -                edgeType =  signRight + signLeft + 2;
> -                signLeft  = -signRight;
> +      pixel firstPxl = 0, lastPxl = 0;
>
> -                rec[x] = clipTbl[rec[x] + m_offsetEo[edgeType]];
> -            }
> +      startX = (lpelx == 0) ? 1 : 0;
> +      endX   = (rpelx == picWidthTmp) ? lcuWidth-1 : lcuWidth;
>
> -            rec += stride;
> -        }
> +      if (lcuWidth % 16)
> +      {
> +          int8_t iSignRight;
> +          int8_t uiEdgeType;

No Hungarian prefixes.  In fact, this whole file probably needs to be
scrubbed of them before making any new changes to it.

>
> +          for (y = 0; y < lcuHeight; y++)
> +          {
> +              int8_t iSignLeft = xSign(rec[startX] - tmpL[y]);
> +              for (x = startX; x < endX; x++)
> +              {
> +                  iSignRight = xSign(rec[x] - rec[x+1]);
> +                  uiEdgeType = iSignRight + iSignLeft + 2;
> +                  iSignLeft  = -iSignRight;
> +
> +                  rec[x] =  Clip3(0, (1 << g_bitDepthY) - 1, rec[x] + m_offsetEo[uiEdgeType]);
> +              }
> +              rec += stride;
> +          }
> +      }
> +      else
> +      {
> +          for (y = 0; y < lcuHeight; y++)
> +          {
> +              int8_t iSignLeft = xSign(rec[startX] - tmpL[y]);
> +
> +              if (lpelx == 0)
> +              {
> +                  firstPxl = rec[0];
> +              }
> +
> +              if (rpelx == picWidthTmp)
> +              {
> +                  lastPxl = rec[lcuWidth - 1];
> +              }
> +
> +              primitives.processSaoCuOrg_8bit_SAO_EO_0(rec, m_offsetEo, lcuWidth, iSignLeft);
> +
> +              if (lpelx == 0)
> +              {
> +                  rec[0] = firstPxl;
> +              }
> +
> +              if (rpelx == picWidthTmp)
> +              {
> +                  rec[lcuWidth - 1] = lastPxl;
> +              }
> +              rec += stride;
> +          }
> +      }
>          break;
>      }
>      case SAO_EO_1: // dir: |
> diff -r c9a0802b64ac -r 33cab8f8f6c2 source/Lib/TLibCommon/TComSampleAdaptiveOffset.h
> --- a/source/Lib/TLibCommon/TComSampleAdaptiveOffset.h  Wed Feb 26 22:16:28 2014 -0600
> +++ b/source/Lib/TLibCommon/TComSampleAdaptiveOffset.h  Thu Feb 27 18:22:09 2014 +0530
> @@ -146,7 +146,7 @@
>
>      int32_t *m_offsetBo;
>      int32_t *m_chromaOffsetBo;
> -    int m_offsetEo[LUMA_GROUP_NUM];
> +    int8_t m_offsetEo[LUMA_GROUP_NUM];
>
>      int  m_picWidth;
>      int  m_picHeight;
> diff -r c9a0802b64ac -r 33cab8f8f6c2 source/Lib/TLibCommon/loopfilter.cpp
> --- /dev/null   Thu Jan 01 00:00:00 1970 +0000
> +++ b/source/Lib/TLibCommon/loopfilter.cpp      Thu Feb 27 18:22:09 2014 +0530
> @@ -0,0 +1,51 @@
> +/*****************************************************************************
> +* Copyright (C) 2013 x265 project
> +*
> +* Authors: Praveen Kumar Tiwari <praveen at multicorewareinc.com>
> +*          Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
> +* This program is free software; you can redistribute it and/or modify
> +* it under the terms of the GNU General Public License as published by
> +* the Free Software Foundation; either version 2 of the License, or
> +* (at your option) any later version.
> +*
> +* This program is distributed in the hope that it will be useful,
> +* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +* GNU General Public License for more details.
> +*
> +* You should have received a copy of the GNU General Public License
> +* along with this program; if not, write to the Free Software
> +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
> +*
> +* This program is also available under a commercial proprietary license.
> +* For more information, contact us at licensing at multicorewareinc.com.
> +*****************************************************************************/
> +
> +#include "primitives.h"
> +
> +#define PIXEL_MIN 0
> +#define PIXEL_MAX ((1 << 8) - 1)

This C ref will not work with 10-bit pixels, will it?

> +
> +void SAO_EO_0_C(pixel * pRec, int8_t * m_iOffsetEo, int iLcuWidth, int8_t iSignLeft)

Use lower-case or camelCase function names, and no Hungarian prefixes.

> +{
> +    int x;
> +    int8_t iSignRight;
> +    int8_t uiEdgeType;
> +
> +    for (x = 0; x < iLcuWidth; x++)
> +    {
> +        iSignRight = ((pRec[x] - pRec[x+1]) < 0) ? -1 : ((pRec[x] - pRec[x+1]) > 0) ? 1 : 0;
> +        uiEdgeType = iSignRight + iSignLeft + 2;
> +        iSignLeft  = -iSignRight;
> +
> +        short v = pRec[x] + m_iOffsetEo[uiEdgeType];
> +        pRec[x] = (v < 0 ? 0 : (v > (PIXEL_MAX)) ? (PIXEL_MAX) : v);
> +    }
> +}
> +
> +namespace x265 {
> +void Setup_C_LoopFilterPrimitives(EncoderPrimitives &p)
> +{
> +    p.processSaoCuOrg_8bit_SAO_EO_0 = SAO_EO_0_C;
> +}
> +}
> diff -r c9a0802b64ac -r 33cab8f8f6c2 source/common/CMakeLists.txt
> --- a/source/common/CMakeLists.txt      Wed Feb 26 22:16:28 2014 -0600
> +++ b/source/common/CMakeLists.txt      Thu Feb 27 18:22:09 2014 +0530
> @@ -38,7 +38,8 @@
>      ../Lib/TLibCommon/TComSlice.cpp
>      ../Lib/TLibCommon/TComTrQuant.cpp
>      ../Lib/TLibCommon/TComWeightPrediction.cpp
> -    ../Lib/TLibCommon/TComYuv.cpp)
> +    ../Lib/TLibCommon/TComYuv.cpp
> +    ../Lib/TLibCommon/loopfilter.cpp)

New files do not go under TLibCommon; they go in common/ with the other C refs.

>  source_group(TLibCommon FILES ${LIBCOMMON_SRC})
>  source_group(TLibCommonH FILES ${LIBCOMMON_HDR})
>
> @@ -102,14 +103,14 @@
>
>  if(ENABLE_ASSEMBLY)
>      set_source_files_properties(primitives.cpp PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1)
> -    set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h)
> +    set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
>      set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm ssd-a.asm mc-a.asm
>                 mc-a2.asm pixel-util8.asm blockcopy8.asm
>                 pixeladd8.asm dct8.asm)
>      if(HIGH_BIT_DEPTH)
>          set(A_SRCS ${A_SRCS} sad16-a.asm intrapred16.asm ipfilter16.asm)
>      else()
> -        set(A_SRCS ${A_SRCS} sad-a.asm intrapred8.asm ipfilter8.asm)
> +        set(A_SRCS ${A_SRCS} sad-a.asm intrapred8.asm ipfilter8.asm loopfilter.asm)
>      endif()
>
>      if(NOT X64)
> diff -r c9a0802b64ac -r 33cab8f8f6c2 source/common/primitives.cpp
> --- a/source/common/primitives.cpp      Wed Feb 26 22:16:28 2014 -0600
> +++ b/source/common/primitives.cpp      Thu Feb 27 18:22:09 2014 +0530
> @@ -63,6 +63,7 @@
>  void Setup_C_DCTPrimitives(EncoderPrimitives &p);
>  void Setup_C_IPFilterPrimitives(EncoderPrimitives &p);
>  void Setup_C_IPredPrimitives(EncoderPrimitives &p);
> +void Setup_C_LoopFilterPrimitives(EncoderPrimitives &p);
>
>  void Setup_C_Primitives(EncoderPrimitives &p)
>  {
> @@ -70,6 +71,7 @@
>      Setup_C_DCTPrimitives(p);        // dct.cpp
>      Setup_C_IPFilterPrimitives(p);   // ipfilter.cpp
>      Setup_C_IPredPrimitives(p);      // intrapred.cpp
> +    Setup_C_LoopFilterPrimitives(p); // loopfilter.cpp
>  }
>  }
>
> diff -r c9a0802b64ac -r 33cab8f8f6c2 source/common/primitives.h
> --- a/source/common/primitives.h        Wed Feb 26 22:16:28 2014 -0600
> +++ b/source/common/primitives.h        Thu Feb 27 18:22:09 2014 +0530
> @@ -188,6 +188,8 @@
>
>  typedef void (*addAvg_t)(int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
>
> +typedef void (*processSaoCuOrg_8bit_t)(pixel * pRec, int8_t * m_iOffsetEo, int iLcuWidth, int8_t iSignLeft);
> +
>  /* Define a structure containing function pointers to optimized encoder
>   * primitives.  Each pointer can reference either an assembly routine,
>   * a vectorized primitive, or a C function. */
> @@ -255,6 +257,9 @@
>      plane_copy_deinterleave_t plane_copy_deinterleave_c;
>      extendCURowBorder_t extendRowBorder;
>
> +    // sao primitives
> +    processSaoCuOrg_8bit_t      processSaoCuOrg_8bit_SAO_EO_0;

8bit seems unnecessary here, and both the type and function names are
overly long and repetitive.

Something much simpler would be preferable, like:  sao_cuorg_t processSaoCUOrgE0

> +
>      struct
>      {
>          filter_pp_t     filter_vpp[NUM_LUMA_PARTITIONS];
> diff -r c9a0802b64ac -r 33cab8f8f6c2 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp      Wed Feb 26 22:16:28 2014 -0600
> +++ b/source/common/x86/asm-primitives.cpp      Thu Feb 27 18:22:09 2014 +0530
> @@ -32,6 +32,7 @@
>  #include "pixel-util.h"
>  #include "mc.h"
>  #include "ipfilter8.h"
> +#include "loopfilter.h"
>  #include "blockcopy8.h"
>  #include "intrapred.h"
>  #include "dct8.h"
> @@ -1258,6 +1259,8 @@
>      }
>      if (cpuMask & X265_CPU_SSE4)
>      {
> +        p.processSaoCuOrg_8bit_SAO_EO_0 = x265_SAO_EO_0_sse4;
> +
>          LUMA_ADDAVG(_sse4);
>          CHROMA_ADDAVG(_sse4);
>          p.cvt16to32_shl = x265_cvt16to32_shl_sse4;
> diff -r c9a0802b64ac -r 33cab8f8f6c2 source/common/x86/loopfilter.asm
> --- /dev/null   Thu Jan 01 00:00:00 1970 +0000
> +++ b/source/common/x86/loopfilter.asm  Thu Feb 27 18:22:09 2014 +0530
> @@ -0,0 +1,81 @@
> +;*****************************************************************************
> +;* Copyright (C) 2013 x265 project
> +;*
> +;* Authors: Praveen Kumar Tiwari <praveen at multicorewareinc.com>

should this file have Praveen's name on it?

> +;*
> +;* This program is free software; you can redistribute it and/or modify
> +;* it under the terms of the GNU General Public License as published by
> +;* the Free Software Foundation; either version 2 of the License, or
> +;* (at your option) any later version.
> +;*
> +;* This program is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +;* GNU General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU General Public License
> +;* along with this program; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
> +;*
> +;* This program is also available under a commercial proprietary license.
> +;* For more information, contact us at licensing at multicorewareinc.com.
> +;*****************************************************************************/
> +
> +%include "x86inc.asm"
> +
> +SECTION_RODATA 32
> +
> +pw_2:    times 16 db  2
> +
> +SECTION .text
> +
> +;============================================================================================================
> +; void SAO_EO_0(Pxl * pRec, int8_t * m_iOffsetEo, Int iLcuWidth, int8_t iSignLeft)
> +;============================================================================================================
> +INIT_XMM sse4
> +cglobal SAO_EO_0, 4, 4, 8, pRec, m_iOffsetEo, iLcuWidth, iSignLeft
> +
> +    neg         r3                 ; r3 = -iSignLeft
> +    movd        m0,    r3d
> +    pslldq      m0,    15          ; m0 = [iSignLeft x .. x]
> +    pcmpeqb     m4,    m4          ; m4 = [pb -1]
> +    pxor        m5,    m5          ; m5 = 0
> +    movu        m6,    [r1]        ; m6 = m_iOffsetEo
> +
> +.loop:
> +    movu        m7,    [r0]                    ; m1 = pRec[x]
> +    mova        m1,    m7
> +    movu        m2,    [r0+1]                  ; m2 = pRec[x+1]
> +
> +    psubusb     m3,    m2,    m7
> +    psubusb     m1,    m2
> +    pcmpeqb     m3,    m5
> +    pcmpeqb     m1,    m5
> +    pcmpeqb     m2,    m7
> +
> +    pabsb       m3,    m3                      ; m1 = (pRec[x] - pRec[x+1]) > 0) ?  1 : 0
> +    por         m1,    m3                      ; m1 = iSignRight
> +    pandn       m2, m1
> +
> +    palignr     m3,    m2,        m0,    15    ; m3 = -iSignLeft
> +    psignb      m3,    m4                      ; m3 = iSignLeft
> +    mova        m0, m4
> +    pslldq      m0, 15
> +    pand        m0,    m2                      ; [pb 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1]
> +    paddb       m2,    m3
> +    paddb       m2,    [pw_2]                  ; m1 = uiEdgeType
> +    pshufb      m3,    m6,        m2
> +    pmovzxbw    m2,    m7                      ; rec
> +    punpckhbw   m7,    m5
> +    pmovsxbw    m1,    m3                      ; iOffsetEo
> +    punpckhbw   m3,    m3
> +    psraw       m3,    8
> +    paddw       m2,    m1
> +    paddw       m7,    m3
> +    packuswb    m2,    m7
> +    movu        [r0],  m2
> +
> +    add         r0q,   16
> +    sub         r2d,   16
> +    jnz        .loop
> +    RET
> diff -r c9a0802b64ac -r 33cab8f8f6c2 source/common/x86/loopfilter.h
> --- /dev/null   Thu Jan 01 00:00:00 1970 +0000
> +++ b/source/common/x86/loopfilter.h    Thu Feb 27 18:22:09 2014 +0530
> @@ -0,0 +1,29 @@
> +/*****************************************************************************
> + * Copyright (C) 2013 x265 project
> + *
> + * Authors: Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
> + *
> + * This program is also available under a commercial proprietary license.
> + * For more information, contact us at licensing at multicorewareinc.com.
> + *****************************************************************************/
> +
> +#ifndef X265_LOOPFILTER_H
> +#define X265_LOOPFILTER_H
> +
> +void x265_SAO_EO_0_sse4(pixel * pRec, int8_t * m_ffsetEo, int iEndX, int8_t iSignLeft);

No Hungarian prefixes, the function name doesn't need upper case, and
function arguments should never have m_ prefixes.

> +
> +#endif // ifndef X265_LOOPFILTER_H
> diff -r c9a0802b64ac -r 33cab8f8f6c2 source/test/pixelharness.cpp
> --- a/source/test/pixelharness.cpp      Wed Feb 26 22:16:28 2014 -0600
> +++ b/source/test/pixelharness.cpp      Thu Feb 27 18:22:09 2014 +0530
> @@ -50,6 +50,8 @@
>      pbuf3 = X265_MALLOC(pixel, bufsize);
>      pbuf4 = X265_MALLOC(pixel, bufsize);
>
> +    psbuf1 = (int8_t*)X265_MALLOC(int8_t, bufsize);

X265_MALLOC returns a typed pointer, so casts are unnecessary.

> +
>      ibuf1 = X265_MALLOC(int, bufsize);
>
>      sbuf1 = X265_MALLOC(int16_t, bufsize);
> @@ -63,7 +65,7 @@
>      short_test_buff2 = X265_MALLOC(int16_t*, TEST_CASES);
>      int_test_buff    = X265_MALLOC(int*, TEST_CASES);
>      if (!pbuf1 || !pbuf2 || !pbuf3 || !pbuf4 || !sbuf1 || !sbuf2 || !sbuf3 || !ibuf1 ||
> -        !pixel_test_buff || !short_test_buff || !int_test_buff || !short_test_buff1 || !short_test_buff2)
> +        !pixel_test_buff || !short_test_buff || !int_test_buff || !short_test_buff1 || !short_test_buff2 || !psbuf1)

wrap this line again

>      {
>          fprintf(stderr, "malloc failed, unable to initiate tests!\n");
>          exit(1);
> @@ -114,6 +116,7 @@
>          sbuf2[i] = (rand() % (2 * SMAX + 1)) - SMAX - 1; //max(SHORT_MIN, min(rand(), SMAX));
>          ibuf1[i] = (rand() % (2 * SMAX + 1)) - SMAX - 1;
>
> +        psbuf1[i] = (rand() %65) - 32;

65 is a rather magical value

>          sbuf3[i] = rand() % PIXEL_MAX; // for blockcopy only
>      }
>  }
> @@ -869,6 +872,39 @@
>      return true;
>  }
>
> +bool PixelHarness::check_SAO_EO_0_C_8bit_t(processSaoCuOrg_8bit_t ref, processSaoCuOrg_8bit_t opt)
> +{
> +    ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
> +    ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
> +
> +    int j = 0;
> +
> +    for(int i = 0; i < sizeof(ref_dest); i++)
> +    {
> +        opt_dest[i] = ref_dest[i] = rand() & PIXEL_MAX;
> +    }
> +
> +    int width =  16 * (rand() % 4 + 1);
> +
> +    int8_t sign = rand () % 3;
> +    if (sign == 2)
> +    {
> +      sign = -1;
> +    }
> +
> +    for (int i = 0; i < ITERS; i++)
> +    {
> +      ref(ref_dest, psbuf1 + j, width, sign);
> +      opt(opt_dest, psbuf1 + j, width, sign);
> +
> +        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
> +            return false;
> +
> +        j += INCR;
> +    }
> +
> +    return true;
> +}
>  bool PixelHarness::testPartition(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt)
>  {
>      if (opt.satd[part])
> @@ -1252,6 +1288,15 @@
>          }
>      }
>
> +    if (opt.processSaoCuOrg_8bit_SAO_EO_0)
> +    {
> +      if (!check_SAO_EO_0_C_8bit_t(ref.processSaoCuOrg_8bit_SAO_EO_0, opt.processSaoCuOrg_8bit_SAO_EO_0))
> +      {
> +        printf("SAO_EO_0 failed\n");
> +        return false;
> +      }
> +    }
> +
>      return true;
>  }
>
> @@ -1531,4 +1576,10 @@
>          HEADER0("ssim_end_4");
>          REPORT_SPEEDUP(opt.ssim_end_4, ref.ssim_end_4, (int(*)[4])pbuf2, (int(*)[4])pbuf1, 4);
>      }
> +
> +    if (opt.processSaoCuOrg_8bit_SAO_EO_0)
> +    {
> +        printf("SAO_EO_0");
> +        REPORT_SPEEDUP(opt.processSaoCuOrg_8bit_SAO_EO_0, ref.processSaoCuOrg_8bit_SAO_EO_0, pbuf1, psbuf1, 64, 1);
> +    }
>  }
> diff -r c9a0802b64ac -r 33cab8f8f6c2 source/test/pixelharness.h
> --- a/source/test/pixelharness.h        Wed Feb 26 22:16:28 2014 -0600
> +++ b/source/test/pixelharness.h        Thu Feb 27 18:22:09 2014 +0530
> @@ -32,6 +32,7 @@
>  protected:
>
>      pixel *pbuf1, *pbuf2, *pbuf3, *pbuf4, **pixel_test_buff;
> +    int8_t *psbuf1;

can't this primitive use pixels?

>      int *ibuf1, **int_test_buff;
>      int16_t *sbuf1, *sbuf2, *sbuf3, **short_test_buff, **short_test_buff1, **short_test_buff2;
>      bool check_pixelcmp(pixelcmp_t ref, pixelcmp_t opt);
> @@ -62,6 +63,7 @@
>      bool check_ssim_4x4x2_core(ssim_4x4x2_core_t ref, ssim_4x4x2_core_t opt);
>      bool check_ssim_end(ssim_end4_t ref, ssim_end4_t opt);
>      bool check_addAvg(addAvg_t, addAvg_t);
> +    bool check_SAO_EO_0_C_8bit_t(processSaoCuOrg_8bit_t ref, processSaoCuOrg_8bit_t opt);

8bit doesn't need to be in the function name, since it's a compile
option, and the trailing _t is also unnecessary.

-- 
Steve Borho