[x265] [PATCH] sao: remove saoCuOrgE3_2Rows function and modify saoCuOrgE3 primitive to handle width=16 separately

Steve Borho steve at borho.org
Wed Apr 22 18:24:34 CEST 2015


On Wed, Apr 22, 2015 at 1:44 AM, Divya Manivannan
<divya at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Divya Manivannan <divya at multicorewareinc.com>
> # Date 1429684176 -19800
> #      Wed Apr 22 11:59:36 2015 +0530
> # Node ID 584211b333ac9640d81423b3f60a18956425e27c
> # Parent  86268e498680951069c48b681eef830b0aa37873
> sao: remove saoCuOrgE3_2Rows function and modify saoCuOrgE3 primitive to handle width=16 separately

queued

> diff -r 86268e498680 -r 584211b333ac source/common/loopfilter.cpp
> --- a/source/common/loopfilter.cpp      Wed Apr 22 00:00:39 2015 -0500
> +++ b/source/common/loopfilter.cpp      Wed Apr 22 11:59:36 2015 +0530
> @@ -122,25 +122,6 @@
>      }
>  }
>
> -void processSaoCUE3_2Rows(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int startX, int endX, int8_t* upBuff)
> -{
> -    int8_t signDown;
> -    int8_t edgeType;
> -
> -    for (int y = 0; y < 2; y++)
> -    {
> -        for (int x = startX + 1; x < endX; x++)
> -        {
> -            signDown = signOf(rec[x] - rec[x + stride]);
> -            edgeType = signDown + upBuff1[x] + 2;
> -            upBuff1[x - 1] = -signDown;
> -            rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
> -        }
> -        upBuff1[endX - 1] = upBuff[y];
> -        rec += stride + 1;
> -    }
> -}
> -
>  void processSaoCUB0(pixel* rec, const int8_t* offset, int ctuWidth, int ctuHeight, intptr_t stride)
>  {
>      #define SAO_BO_BITS 5
> @@ -164,8 +145,8 @@
>      p.saoCuOrgE1 = processSaoCUE1;
>      p.saoCuOrgE1_2Rows = processSaoCUE1_2Rows;
>      p.saoCuOrgE2 = processSaoCUE2;
> -    p.saoCuOrgE3 = processSaoCUE3;
> -    p.saoCuOrgE3_2Rows = processSaoCUE3_2Rows;
> +    p.saoCuOrgE3[0] = processSaoCUE3;
> +    p.saoCuOrgE3[1] = processSaoCUE3;
>      p.saoCuOrgB0 = processSaoCUB0;
>      p.sign = calSign;
>  }
> diff -r 86268e498680 -r 584211b333ac source/common/primitives.h
> --- a/source/common/primitives.h        Wed Apr 22 00:00:39 2015 -0500
> +++ b/source/common/primitives.h        Wed Apr 22 11:59:36 2015 +0530
> @@ -172,7 +172,6 @@
>  typedef void (*saoCuOrgE1_t)(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
>  typedef void (*saoCuOrgE2_t)(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
>  typedef void (*saoCuOrgE3_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX);
> -typedef void (*saoCuOrgE3_2Rows_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX, int8_t* upBuff);
>  typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
>  typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
>  typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
> @@ -278,8 +277,7 @@
>      saoCuOrgE0_t          saoCuOrgE0;
>      saoCuOrgE1_t          saoCuOrgE1, saoCuOrgE1_2Rows;
>      saoCuOrgE2_t          saoCuOrgE2;
> -    saoCuOrgE3_t          saoCuOrgE3;
> -    saoCuOrgE3_2Rows_t    saoCuOrgE3_2Rows;
> +    saoCuOrgE3_t          saoCuOrgE3[2];

Please send a follow-up patch that documents why there are two versions of
this primitive.

>      saoCuOrgB0_t          saoCuOrgB0;
>
>      downscale_t           frameInitLowres;
> diff -r 86268e498680 -r 584211b333ac source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp      Wed Apr 22 00:00:39 2015 -0500
> +++ b/source/common/x86/asm-primitives.cpp      Wed Apr 22 11:59:36 2015 +0530
> @@ -1519,8 +1519,8 @@
>          p.saoCuOrgE1 = x265_saoCuOrgE1_sse4;
>          p.saoCuOrgE1_2Rows = x265_saoCuOrgE1_2Rows_sse4;
>          p.saoCuOrgE2 = x265_saoCuOrgE2_sse4;
> -        p.saoCuOrgE3 = x265_saoCuOrgE3_sse4;
> -        p.saoCuOrgE3_2Rows = x265_saoCuOrgE3_2Rows_sse4;
> +        p.saoCuOrgE3[0] = x265_saoCuOrgE3_sse4;
> +        p.saoCuOrgE3[1] = x265_saoCuOrgE3_sse4;
>          p.saoCuOrgB0 = x265_saoCuOrgB0_sse4;
>
>          LUMA_ADDAVG(sse4);
> @@ -1728,7 +1728,7 @@
>          p.saoCuOrgE0 = x265_saoCuOrgE0_avx2;
>          p.saoCuOrgE1 = x265_saoCuOrgE1_avx2;
>          p.saoCuOrgE1_2Rows = x265_saoCuOrgE1_2Rows_avx2;
> -        p.saoCuOrgE3 = x265_saoCuOrgE3_avx2;
> +        p.saoCuOrgE3[0] = x265_saoCuOrgE3_avx2;
>          p.saoCuOrgB0 = x265_saoCuOrgB0_avx2;
>
>          p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_avx2;
> diff -r 86268e498680 -r 584211b333ac source/common/x86/loopfilter.asm
> --- a/source/common/x86/loopfilter.asm  Wed Apr 22 00:00:39 2015 -0500
> +++ b/source/common/x86/loopfilter.asm  Wed Apr 22 11:59:36 2015 +0530
> @@ -582,135 +582,6 @@
>      movhps          [r1 + r5 - 1], xm7
>      RET
>
> -;=============================================================================================================================
> -;void saoCuOrgE3_2Rows(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX, int8_t* upBuff)
> -;=============================================================================================================================
> -INIT_XMM sse4
> -cglobal saoCuOrgE3_2Rows, 3, 7, 8
> -    mov             r3d, r3m
> -    mov             r4d, r4m
> -    movu            m5, [r2]
> -    mov             r2d, r5m
> -    mov             r6,  r6m
> -
> -    movh            m7, [r0 + r2]
> -    movhps          m7, [r1 + r2 - 1]
> -
> -    inc             r4d
> -    add             r0, r4
> -    add             r1, r4
> -
> -    sub             r2d, r4d
> -    pxor            m0, m0                      ; m0 = 0
> -    mova            m6, [pb_2]                  ; m6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
> -
> -.loop:
> -    movu            m1, [r0]                    ; m1 = pRec[x]
> -    movu            m2, [r0 + r3]               ; m2 = pRec[x + iStride]
> -
> -    psubusb         m3, m2, m1
> -    psubusb         m4, m1, m2
> -    pcmpeqb         m3, m0
> -    pcmpeqb         m4, m0
> -    pcmpeqb         m2, m1
> -
> -    pabsb           m3, m3
> -    por             m4, m3
> -    pandn           m2, m4                      ; m2 = iSignDown
> -
> -    movu            m3, [r1]                    ; m3 = m_iUpBuff1
> -
> -    paddb           m3, m2
> -    paddb           m3, m6                      ; m3 = uiEdgeType
> -
> -    pshufb          m4, m5, m3
> -
> -    psubb           m3, m0, m2
> -    movu            [r1 - 1], m3
> -
> -    pmovzxbw        m2, m1
> -    punpckhbw       m1, m0
> -    pmovsxbw        m3, m4
> -    punpckhbw       m4, m4
> -    psraw           m4, 8
> -
> -    paddw           m2, m3
> -    paddw           m1, m4
> -    packuswb        m2, m1
> -    movu            [r0], m2
> -
> -    add             r0, 16
> -    add             r1, 16
> -    sub             r2, 16
> -    jg              .loop
> -
> -    add             r0, r2
> -    add             r1, r2
> -    movh            [r0], m7
> -    movhps          [r1 - 1], m7
> -
> -    mov             r5d, r5m
> -    mov             r2b, byte[r6]
> -    mov             byte[r1 - 1], r2b
> -
> -    sub             r0, r5
> -    lea             r0, [r0 + r3 + 1]
> -
> -    movh            m7, [r0 + r5]
> -    movhps          m7, [r1 - 1]
> -
> -    sub             r1, r5
> -    add             r0, r4
> -    add             r1, r4
> -    sub             r5d, r4d
> -
> -.loop1:
> -    movu            m1, [r0]                    ; m1 = pRec[x]
> -    movu            m2, [r0 + r3]               ; m2 = pRec[x + iStride]
> -
> -    psubusb         m3, m2, m1
> -    psubusb         m4, m1, m2
> -    pcmpeqb         m3, m0
> -    pcmpeqb         m4, m0
> -    pcmpeqb         m2, m1
> -
> -    pabsb           m3, m3
> -    por             m4, m3
> -    pandn           m2, m4                      ; m2 = iSignDown
> -
> -    movu            m3, [r1]                    ; m3 = m_iUpBuff1
> -
> -    paddb           m3, m2
> -    paddb           m3, m6                      ; m3 = uiEdgeType
> -
> -    pshufb          m4, m5, m3
> -
> -    psubb           m3, m0, m2
> -    movu            [r1 - 1], m3
> -
> -    pmovzxbw        m2, m1
> -    punpckhbw       m1, m0
> -    pmovsxbw        m3, m4
> -    punpckhbw       m4, m4
> -    psraw           m4, 8
> -
> -    paddw           m2, m3
> -    paddw           m1, m4
> -    packuswb        m2, m1
> -    movu            [r0], m2
> -
> -    add             r0, 16
> -    add             r1, 16
> -    sub             r5, 16
> -    jg              .loop1
> -
> -    movh            [r0 + r5], m7
> -    movhps          [r1 + r5 - 1], m7
> -
> -    mov             r2b, byte[r6 + 1]
> -    mov             byte[r1 + r5 - 1], r2b
> -    RET
> -
>  ;=====================================================================================
>  ; void saoCuOrgB0(pixel* rec, const pixel* offset, int lcuWidth, int lcuHeight, int stride)
>  ;=====================================================================================
> diff -r 86268e498680 -r 584211b333ac source/common/x86/loopfilter.h
> --- a/source/common/x86/loopfilter.h    Wed Apr 22 00:00:39 2015 -0500
> +++ b/source/common/x86/loopfilter.h    Wed Apr 22 11:59:36 2015 +0530
> @@ -34,7 +34,6 @@
>  void x265_saoCuOrgE2_sse4(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
>  void x265_saoCuOrgE3_sse4(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
>  void x265_saoCuOrgE3_avx2(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
> -void x265_saoCuOrgE3_2Rows_sse4(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX, int8_t* upBuff);
>  void x265_saoCuOrgB0_sse4(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
>  void x265_saoCuOrgB0_avx2(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
>  void x265_calSign_sse4(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
> diff -r 86268e498680 -r 584211b333ac source/encoder/sao.cpp
> --- a/source/encoder/sao.cpp    Wed Apr 22 00:00:39 2015 -0500
> +++ b/source/encoder/sao.cpp    Wed Apr 22 11:59:36 2015 +0530
> @@ -516,41 +516,20 @@
>              if (rpelx == picWidth)
>                  upBuff1[ctuWidth - 1] = lastSign;
>
> -            int diff = endY - startY;
> -            for (y = 0; y < (diff >> 1); y++)
> -            {
> -                int8_t signDown, signDown0, upBuff[2];
> -                int edgeType1;
> -
> -                signDown = signOf(rec[startX] - tmpL[y * 2 + 1 + startY]);
> -                edgeType1 = signDown + upBuff1[startX] + 2;
> -                rec[startX] = m_clipTable[rec[startX] + m_offsetEo[edgeType1]];
> -
> -                signDown = signOf(rec[startX + stride] - tmpL[y * 2 + 2 + startY]);
> -                signDown0 = signOf(rec[startX + 1] - rec[startX + stride]);
> -                edgeType1 = signDown - signDown0 + 2;
> -                upBuff1[startX - 1] = -signDown;
> -
> -                upBuff[0] = signOf(rec[endX - 1 + stride] - rec[endX]);
> -                upBuff[1] = signOf(rec[endX - 1 + 2 * stride] - rec[endX + stride]);
> -
> -                primitives.saoCuOrgE3_2Rows(rec, upBuff1, m_offsetEo, stride - 1, startX, endX, upBuff);
> -
> -                rec[startX + stride] = m_clipTable[rec[startX + stride] + m_offsetEo[edgeType1]];
> -
> -                rec += 2 * stride;
> -            }
> -            if (diff & 1)
> -            {
> -                int8_t signDown1 = signOf(rec[startX] - tmpL[y * 2 + 1 + startY]);
> -                int edgeType = signDown1 + upBuff1[startX] + 2;
> -                upBuff1[startX - 1] = -signDown1;
> -                rec[startX] = m_clipTable[rec[startX] + m_offsetEo[edgeType]];
> -
> -                primitives.saoCuOrgE3(rec, upBuff1, m_offsetEo, stride - 1, startX, endX);
> -
> -                upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
> -            }
> +            for (y = startY; y < endY; y++)
> +            {
> +                x = startX;
> +                int8_t signDown = signOf(rec[x] - tmpL[y + 1]);
> +                int edgeType = signDown + upBuff1[x] + 2;
> +                upBuff1[x - 1] = -signDown;
> +                rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
> +
> +                primitives.saoCuOrgE3[endX > 16](rec, upBuff1, m_offsetEo, stride - 1, startX, endX);
> +
> +                upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
> +
> +                rec += stride;
> +            }
>          }
>
>          break;
> diff -r 86268e498680 -r 584211b333ac source/test/pixelharness.cpp
> --- a/source/test/pixelharness.cpp      Wed Apr 22 00:00:39 2015 -0500
> +++ b/source/test/pixelharness.cpp      Wed Apr 22 11:59:36 2015 +0530
> @@ -66,7 +66,7 @@
>          sbuf2[i] = (rand() % (2 * SMAX + 1)) - SMAX - 1; //max(SHORT_MIN, min(rand(), SMAX));
>          ibuf1[i] = (rand() % (2 * SMAX + 1)) - SMAX - 1;
>          psbuf1[i] = psbuf4[i] = (rand() % 65) - 32;                   // range is between -32 to 32
> -        psbuf2[i] = psbuf5[i] = psbuf6[i] = psbuf7[i] = (rand() % 3) - 1; // possible values {-1,0,1}
> +        psbuf2[i] = psbuf5[i] = (rand() % 3) - 1;                     // possible values {-1,0,1}
>          psbuf3[i] = (rand() % 129) - 128;
>          sbuf3[i] = rand() % PIXEL_MAX; // for blockcopy only
>      }
> @@ -1011,34 +1011,34 @@
>      return true;
>  }
>
> -bool PixelHarness::check_saoCuOrgE3_2Rows_t(saoCuOrgE3_2Rows_t ref, saoCuOrgE3_2Rows_t opt)
> -{
> -    ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
> -    ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
> -
> -    memset(ref_dest, 0xCD, sizeof(ref_dest));
> -    memset(opt_dest, 0xCD, sizeof(opt_dest));
> -
> -    int j = 0;
> -
> -    for (int i = 0; i < ITERS; i++)
> -    {
> -        int stride = 16 * (rand() % 4 + 1);
> -        int start = rand() % 2;
> -        int end = (16 * (rand() % 4 + 1)) - rand() % 2;
> -
> -        ref(ref_dest, psbuf2 + j, psbuf1 + j, stride, start, end, psbuf6 + j);
> -        checked(opt, opt_dest, psbuf5 + j, psbuf1 + j, stride, start, end, psbuf7 + j);
> -
> -        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)) || memcmp(psbuf2, psbuf5, BUFFSIZE))
> -            return false;
> -
> -        reportfail();
> -        j += INCR;
> -    }
> -
> -    return true;
> -}
> +bool PixelHarness::check_saoCuOrgE3_32_t(saoCuOrgE3_t ref, saoCuOrgE3_t opt)
> +{
> +    ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
> +    ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
> +
> +    memset(ref_dest, 0xCD, sizeof(ref_dest));
> +    memset(opt_dest, 0xCD, sizeof(opt_dest));
> +
> +    int j = 0;
> +
> +    for (int i = 0; i < ITERS; i++)
> +    {
> +        int stride = 32 * (rand() % 2 + 1);
> +        int start = rand() % 2;
> +        int end = (32 * (rand() % 2 + 1)) - rand() % 2;
> +
> +        ref(ref_dest, psbuf2 + j, psbuf1 + j, stride, start, end);
> +        checked(opt, opt_dest, psbuf5 + j, psbuf1 + j, stride, start, end);
> +
> +        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)) || memcmp(psbuf2, psbuf5, BUFFSIZE))
> +            return false;
> +
> +        reportfail();
> +        j += INCR;
> +    }
> +
> +    return true;
> +}
>
>  bool PixelHarness::check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt)
>  {
> @@ -1788,20 +1788,20 @@
>          }
>      }
>
> -    if (opt.saoCuOrgE3)
> +    if (opt.saoCuOrgE3[0])
>      {
> -        if (!check_saoCuOrgE3_t(ref.saoCuOrgE3, opt.saoCuOrgE3))
> +        if (!check_saoCuOrgE3_t(ref.saoCuOrgE3[0], opt.saoCuOrgE3[0]))
>          {
> -            printf("SAO_EO_3 failed\n");
> +            printf("SAO_EO_3[0] failed\n");
>              return false;
>          }
>      }
>
> -    if (opt.saoCuOrgE3_2Rows)
> +    if (opt.saoCuOrgE3[1])
>      {
> -        if (!check_saoCuOrgE3_2Rows_t(ref.saoCuOrgE3_2Rows, opt.saoCuOrgE3_2Rows))
> +        if (!check_saoCuOrgE3_32_t(ref.saoCuOrgE3[1], opt.saoCuOrgE3[1]))
>          {
> -            printf("SAO_EO_3_2Rows failed\n");
> +            printf("SAO_EO_3[1] failed\n");
>              return false;
>          }
>      }
> @@ -2192,16 +2192,16 @@
>          REPORT_SPEEDUP(opt.saoCuOrgE2, ref.saoCuOrgE2, pbuf1, psbuf1, psbuf2, psbuf3, 64, 64);
>      }
>
> -    if (opt.saoCuOrgE3)
> +    if (opt.saoCuOrgE3[0])
>      {
> -        HEADER0("SAO_EO_3");
> -        REPORT_SPEEDUP(opt.saoCuOrgE3, ref.saoCuOrgE3, pbuf1, psbuf2, psbuf1, 64, 0, 64);
> +        HEADER0("SAO_EO_3[0]");
> +        REPORT_SPEEDUP(opt.saoCuOrgE3[0], ref.saoCuOrgE3[0], pbuf1, psbuf2, psbuf1, 64, 0, 64);
>      }
>
> -    if (opt.saoCuOrgE3_2Rows)
> +    if (opt.saoCuOrgE3[1])
>      {
> -        HEADER0("SAO_EO_3_2Rows");
> -        REPORT_SPEEDUP(opt.saoCuOrgE3_2Rows, ref.saoCuOrgE3_2Rows, pbuf1, psbuf2, psbuf1, 64, 0, 64, psbuf6);
> +        HEADER0("SAO_EO_3[1]");
> +        REPORT_SPEEDUP(opt.saoCuOrgE3[1], ref.saoCuOrgE3[1], pbuf1, psbuf2, psbuf1, 64, 0, 64);
>      }
>
>      if (opt.saoCuOrgB0)
> diff -r 86268e498680 -r 584211b333ac source/test/pixelharness.h
> --- a/source/test/pixelharness.h        Wed Apr 22 00:00:39 2015 -0500
> +++ b/source/test/pixelharness.h        Wed Apr 22 11:59:36 2015 +0530
> @@ -51,8 +51,6 @@
>      int8_t   psbuf3[BUFFSIZE];
>      int8_t   psbuf4[BUFFSIZE];
>      int8_t   psbuf5[BUFFSIZE];
> -    int8_t   psbuf6[BUFFSIZE];
> -    int8_t   psbuf7[BUFFSIZE];
>
>      int16_t  sbuf1[BUFFSIZE];
>      int16_t  sbuf2[BUFFSIZE];
> @@ -100,7 +98,7 @@
>      bool check_saoCuOrgE1_t(saoCuOrgE1_t ref, saoCuOrgE1_t opt);
>      bool check_saoCuOrgE2_t(saoCuOrgE2_t ref, saoCuOrgE2_t opt);
>      bool check_saoCuOrgE3_t(saoCuOrgE3_t ref, saoCuOrgE3_t opt);
> -    bool check_saoCuOrgE3_2Rows_t(saoCuOrgE3_2Rows_t ref, saoCuOrgE3_2Rows_t opt);
> +    bool check_saoCuOrgE3_32_t(saoCuOrgE3_t ref, saoCuOrgE3_t opt);
>      bool check_saoCuOrgB0_t(saoCuOrgB0_t ref, saoCuOrgB0_t opt);
>      bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);
>      bool check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt);
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel



-- 
Steve Borho


More information about the x265-devel mailing list