[x265] [PATCH] replace sse_sp(residual, ZERO) by ssd_s(residual)

Steve Borho steve at borho.org
Wed Jul 16 01:32:51 CEST 2014


On 07/15, Min Chen wrote:
> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1405471890 25200
> # Node ID 78f7b217e5d53ab981bb0b5ac0f43e8c46260c9f
> # Parent  c923f4a9494619665bf49db7ae0e250e2f8c4ec7
> replace sse_sp(residual, ZERO) by ssd_s(residual)
> 
> diff -r c923f4a94946 -r 78f7b217e5d5 source/Lib/TLibEncoder/TEncSearch.cpp
> --- a/source/Lib/TLibEncoder/TEncSearch.cpp	Mon Jul 14 17:27:04 2014 +0530
> +++ b/source/Lib/TLibEncoder/TEncSearch.cpp	Tue Jul 15 17:51:30 2014 -0700
> @@ -2374,9 +2374,8 @@
>      if ((cu->getSlice()->getPPS()->getTransquantBypassEnableFlag()))
>      {
>          bIsTQBypassEnable = true; // mark that the first iteration is to cost TQB mode.
> -        tqBypassMode = 2;
> -        if (m_param->bLossless)
> -            tqBypassMode = 1;
> +        if (!m_param->bLossless)
> +            tqBypassMode = 2;

The patch looks good except for this part; I'd like Ashok to review this
change. It looks unrelated to the rest of the patch anyway.

>      }
>  
>      uint64_t bestCost = MAX_INT64;
> @@ -2814,7 +2813,8 @@
>          }
>  
>          int partSize = partitionFromLog2Size(log2TrSize);
> -        uint32_t distY = primitives.sse_sp[partSize](resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, (pixel*)RDCost::zeroPel, 0);
> +        assert(log2TrSize <= 5);

We should be using X265_CHECK() instead of assert()

> +        uint32_t distY = primitives.ssd_s[log2TrSize - 2](resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width);
>          uint32_t psyEnergyY = 0;
>          if (m_rdCost.psyRdEnabled())
>          {
> @@ -2923,7 +2923,7 @@
>                  int16_t *curResiU = m_qtTempShortYuv[qtLayer].getCbAddr(absPartIdxC);
>                  int16_t *curResiV = m_qtTempShortYuv[qtLayer].getCrAddr(absPartIdxC);
>  
> -                distU = m_rdCost.scaleChromaDistCb(primitives.sse_sp[partSizeC](resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth, (pixel*)RDCost::zeroPel, 0));
> +                distU = m_rdCost.scaleChromaDistCb(primitives.ssd_s[log2TrSizeC - 2](resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth));
>  
>                  if (outZeroDist)
>                      *outZeroDist += distU;
> @@ -3008,7 +3008,7 @@
>                  if (!numSigU[tuIterator.section])
>                      primitives.blockfill_s[sizeIdxC](curResiU, strideResiC, 0);
>  
> -                distV = m_rdCost.scaleChromaDistCr(primitives.sse_sp[partSizeC](resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth, (pixel*)RDCost::zeroPel, 0));
> +                distV = m_rdCost.scaleChromaDistCr(primitives.ssd_s[log2TrSizeC - 2](resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth));
>                  if (outZeroDist)
>                      *outZeroDist += distV;
>  
> diff -r c923f4a94946 -r 78f7b217e5d5 source/common/pixel.cpp
> --- a/source/common/pixel.cpp	Mon Jul 14 17:27:04 2014 +0530
> +++ b/source/common/pixel.cpp	Tue Jul 15 17:51:30 2014 -0700
> @@ -375,6 +375,21 @@
>      return cost;
>  }
>  
> +template<int size>
> +int pixel_ssd_s_c(short *a, intptr_t dstride)
> +{
> +    int sum = 0;
> +    for (int y = 0; y < size; y++)
> +    {
> +        for (int x = 0; x < size; x++)
> +        {
> +            sum += a[x] * a[x];
> +        }
> +        a += dstride;
> +    }
> +    return sum;
> +}
> +
>  void blockcopy_p_p(int bx, int by, pixel *a, intptr_t stridea, pixel *b, intptr_t strideb)
>  {
>      for (int y = 0; y < by; y++)
> @@ -1200,6 +1215,11 @@
>      p.transpose[BLOCK_32x32] = transpose<32>;
>      p.transpose[BLOCK_64x64] = transpose<64>;
>  
> +    p.ssd_s[BLOCK_4x4] = pixel_ssd_s_c<4>;
> +    p.ssd_s[BLOCK_8x8] = pixel_ssd_s_c<8>;
> +    p.ssd_s[BLOCK_16x16] = pixel_ssd_s_c<16>;
> +    p.ssd_s[BLOCK_32x32] = pixel_ssd_s_c<32>;
> +
>      p.weight_pp = weight_pp_c;
>      p.weight_sp = weight_sp_c;
>  
> diff -r c923f4a94946 -r 78f7b217e5d5 source/common/primitives.h
> --- a/source/common/primitives.h	Mon Jul 14 17:27:04 2014 +0530
> +++ b/source/common/primitives.h	Tue Jul 15 17:51:30 2014 -0700
> @@ -130,6 +130,7 @@
>  typedef int  (*pixelcmp_t)(pixel *fenc, intptr_t fencstride, pixel *fref, intptr_t frefstride); // fenc is aligned
>  typedef int  (*pixelcmp_ss_t)(int16_t *fenc, intptr_t fencstride, int16_t *fref, intptr_t frefstride);
>  typedef int  (*pixelcmp_sp_t)(int16_t *fenc, intptr_t fencstride, pixel *fref, intptr_t frefstride);
> +typedef int  (*pixel_ssd_s_t)(int16_t *fenc, intptr_t fencstride);
>  typedef void (*pixelcmp_x4_t)(pixel *fenc, pixel *fref0, pixel *fref1, pixel *fref2, pixel *fref3, intptr_t frefstride, int32_t *res);
>  typedef void (*pixelcmp_x3_t)(pixel *fenc, pixel *fref0, pixel *fref1, pixel *fref2, intptr_t frefstride, int32_t *res);
>  typedef void (*blockcpy_pp_t)(int bx, int by, pixel *dst, intptr_t dstride, pixel *src, intptr_t sstride); // dst is aligned
> @@ -204,6 +205,7 @@
>      pixelcmp_t      sse_pp[NUM_LUMA_PARTITIONS];     // Sum of Square Error (pixel, pixel) fenc alignment not assumed
>      pixelcmp_ss_t   sse_ss[NUM_LUMA_PARTITIONS];     // Sum of Square Error (short, short) fenc alignment not assumed
>      pixelcmp_sp_t   sse_sp[NUM_LUMA_PARTITIONS];     // Sum of Square Error (short, pixel) fenc alignment not assumed
> +    pixel_ssd_s_t   ssd_s[NUM_SQUARE_BLOCKS - 1];    // Sum of Square Error (short) fenc alignment not assumed
>      pixelcmp_t      satd[NUM_LUMA_PARTITIONS];       // Sum of Transformed differences (HADAMARD)
>      pixelcmp_t      sa8d_inter[NUM_LUMA_PARTITIONS]; // sa8d primitives for motion search partitions
>      pixelcmp_t      sa8d[NUM_SQUARE_BLOCKS];         // sa8d primitives for square intra blocks
> diff -r c923f4a94946 -r 78f7b217e5d5 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp	Mon Jul 14 17:27:04 2014 +0530
> +++ b/source/common/x86/asm-primitives.cpp	Tue Jul 15 17:51:30 2014 -0700
> @@ -1024,6 +1024,12 @@
>          p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_sse2;
>          p.blockfill_s[BLOCK_32x32] = x265_blockfill_s_32x32_sse2;
>  
> +        // TODO: overflow on 12-bits mode!
> +        p.ssd_s[BLOCK_4x4] = x265_pixel_ssd_s_4_sse2;
> +        p.ssd_s[BLOCK_8x8] = x265_pixel_ssd_s_8_sse2;
> +        p.ssd_s[BLOCK_16x16] = x265_pixel_ssd_s_16_sse2;
> +        p.ssd_s[BLOCK_32x32] = x265_pixel_ssd_s_32_sse2;
> +
>          p.calcresidual[BLOCK_4x4] = x265_getResidual4_sse2;
>          p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2;
>          p.calcresidual[BLOCK_16x16] = x265_getResidual16_sse2;
> @@ -1156,6 +1162,11 @@
>          p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_sse2;
>          p.blockfill_s[BLOCK_32x32] = x265_blockfill_s_32x32_sse2;
>  
> +        p.ssd_s[BLOCK_4x4] = x265_pixel_ssd_s_4_sse2;
> +        p.ssd_s[BLOCK_8x8] = x265_pixel_ssd_s_8_sse2;
> +        p.ssd_s[BLOCK_16x16] = x265_pixel_ssd_s_16_sse2;
> +        p.ssd_s[BLOCK_32x32] = x265_pixel_ssd_s_32_sse2;
> +
>          p.frame_init_lowres_core = x265_frame_init_lowres_core_sse2;
>          SA8D_INTER_FROM_BLOCK(sse2);
>  
> @@ -1315,6 +1326,7 @@
>          INIT2_NAME(sse_pp, ssd, _avx2);
>          p.sad_x4[LUMA_16x12] = x265_pixel_sad_x4_16x12_avx2;
>          p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_avx2;
> +        p.ssd_s[BLOCK_32x32] = x265_pixel_ssd_s_32_avx2;
>      }
>  #endif // if HIGH_BIT_DEPTH
>  }
> diff -r c923f4a94946 -r 78f7b217e5d5 source/common/x86/pixel.h
> --- a/source/common/x86/pixel.h	Mon Jul 14 17:27:04 2014 +0530
> +++ b/source/common/x86/pixel.h	Tue Jul 15 17:51:30 2014 -0700
> @@ -166,6 +166,12 @@
>  int x265_pixel_ssd_64x48_sse4(pixel *, intptr_t, pixel *, intptr_t);
>  int x265_pixel_ssd_64x64_sse4(pixel *, intptr_t, pixel *, intptr_t);
>  
> +int x265_pixel_ssd_s_4_sse2(int16_t *, intptr_t);
> +int x265_pixel_ssd_s_8_sse2(int16_t *, intptr_t);
> +int x265_pixel_ssd_s_16_sse2(int16_t *, intptr_t);
> +int x265_pixel_ssd_s_32_sse2(int16_t *, intptr_t);
> +int x265_pixel_ssd_s_32_avx2(int16_t *, intptr_t);
> +
>  #define ADDAVG(func)  \
>      void x265_ ## func ## _sse4(int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t);
>  ADDAVG(addAvg_2x4)
> diff -r c923f4a94946 -r 78f7b217e5d5 source/common/x86/ssd-a.asm
> --- a/source/common/x86/ssd-a.asm	Mon Jul 14 17:27:04 2014 +0530
> +++ b/source/common/x86/ssd-a.asm	Tue Jul 15 17:51:30 2014 -0700
> @@ -2395,3 +2395,224 @@
>      HADDD    m7,     m1
>      movd     eax,    m7
>      RET
> +
> +
> +;-----------------------------------------------------------------------------
> +; int pixel_ssd_s( int16_t *ref, intptr_t i_stride )
> +;-----------------------------------------------------------------------------
> +INIT_XMM sse2
> +cglobal pixel_ssd_s_4, 2,2,2
> +    add     r1, r1
> +    movh    m0, [r0]
> +    movhps  m0, [r0 + r1]
> +
> +    lea     r0, [r0 + r1 * 2]
> +    movh    m1, [r0]
> +    movhps  m1, [r0 + r1]
> +
> +    pmaddwd m0, m0
> +    pmaddwd m1, m1
> +
> +    ; calculate sum
> +    paddd   m0, m1
> +    movhlps m1, m0
> +    paddd   m0, m1
> +    pshufd  m1, m0, 1
> +    paddd   m0, m1
> +
> +    movd    eax, m0
> +    RET
> +
> +
> +INIT_XMM sse2
> +cglobal pixel_ssd_s_8, 2,3,5
> +    add     r1, r1
> +    lea     r2, [r1 * 3]
> +    movu    m0, [r0]
> +    movu    m1, [r0 + r1]
> +    movu    m2, [r0 + r1 * 2]
> +    movu    m3, [r0 + r2]
> +
> +    pmaddwd m0, m0
> +    pmaddwd m1, m1
> +    pmaddwd m2, m2
> +    pmaddwd m3, m3
> +    paddd   m0, m1
> +    paddd   m2, m3
> +    paddd   m0, m2
> +
> +    lea     r0, [r0 + r1 * 4]
> +    movu    m4, [r0]
> +    movu    m1, [r0 + r1]
> +    movu    m2, [r0 + r1 * 2]
> +    movu    m3, [r0 + r2]
> +
> +    pmaddwd m4, m4
> +    pmaddwd m1, m1
> +    pmaddwd m2, m2
> +    pmaddwd m3, m3
> +    paddd   m4, m1
> +    paddd   m2, m3
> +    paddd   m4, m2
> +
> +    ; calculate sum
> +    paddd   m0, m4
> +    movhlps m1, m0
> +    paddd   m0, m1
> +    pshufd  m1, m0, 1
> +    paddd   m0, m1
> +
> +    movd    eax, m0
> +    RET
> +
> +
> +INIT_XMM sse2
> +cglobal pixel_ssd_s_16, 2,3,5
> +    add     r1, r1
> +
> +    mov     r2d, 4
> +    pxor    m0, m0
> +.loop:
> +    movu    m1, [r0]
> +    movu    m2, [r0 + mmsize]
> +    movu    m3, [r0 + r1]
> +    movu    m4, [r0 + r1 + mmsize]
> +    lea     r0, [r0 + r1 * 2]
> +
> +    pmaddwd m1, m1
> +    pmaddwd m2, m2
> +    pmaddwd m3, m3
> +    pmaddwd m4, m4
> +    paddd   m1, m2
> +    paddd   m3, m4
> +    paddd   m1, m3
> +    paddd   m0, m1
> +
> +    movu    m1, [r0]
> +    movu    m2, [r0 + mmsize]
> +    movu    m3, [r0 + r1]
> +    movu    m4, [r0 + r1 + mmsize]
> +    lea     r0, [r0 + r1 * 2]
> +
> +    pmaddwd m1, m1
> +    pmaddwd m2, m2
> +    pmaddwd m3, m3
> +    pmaddwd m4, m4
> +    paddd   m1, m2
> +    paddd   m3, m4
> +    paddd   m1, m3
> +    paddd   m0, m1
> +
> +    dec     r2d
> +    jnz    .loop
> +
> +    ; calculate sum
> +    movhlps m1, m0
> +    paddd   m0, m1
> +    pshufd  m1, m0, 1
> +    paddd   m0, m1
> +
> +    movd    eax, m0
> +    RET
> +
> +
> +INIT_XMM sse2
> +cglobal pixel_ssd_s_32, 2,3,5
> +    add     r1, r1
> +
> +    mov     r2d, 16
> +    pxor    m0, m0
> +.loop:
> +    movu    m1, [r0 + 0 * mmsize]
> +    movu    m2, [r0 + 1 * mmsize]
> +    movu    m3, [r0 + 2 * mmsize]
> +    movu    m4, [r0 + 3 * mmsize]
> +    add     r0, r1
> +
> +    pmaddwd m1, m1
> +    pmaddwd m2, m2
> +    pmaddwd m3, m3
> +    pmaddwd m4, m4
> +    paddd   m1, m2
> +    paddd   m3, m4
> +    paddd   m1, m3
> +    paddd   m0, m1
> +
> +    movu    m1, [r0 + 0 * mmsize]
> +    movu    m2, [r0 + 1 * mmsize]
> +    movu    m3, [r0 + 2 * mmsize]
> +    movu    m4, [r0 + 3 * mmsize]
> +    add     r0, r1
> +
> +    pmaddwd m1, m1
> +    pmaddwd m2, m2
> +    pmaddwd m3, m3
> +    pmaddwd m4, m4
> +    paddd   m1, m2
> +    paddd   m3, m4
> +    paddd   m1, m3
> +    paddd   m0, m1
> +
> +    dec     r2d
> +    jnz    .loop
> +
> +    ; calculate sum
> +    movhlps m1, m0
> +    paddd   m0, m1
> +    pshufd  m1, m0, 1
> +    paddd   m0, m1
> +
> +    movd    eax, m0
> +    RET
> +
> +
> +INIT_YMM avx2
> +cglobal pixel_ssd_s_32, 2,4,5
> +    add     r1, r1
> +    lea     r3, [r1 * 3]
> +
> +    mov     r2d, 8
> +    pxor    m0, m0
> +.loop:
> +    movu    m1, [r0 + 0 * mmsize]
> +    movu    m2, [r0 + 1 * mmsize]
> +    movu    m3, [r0 + r1 + 0 * mmsize]
> +    movu    m4, [r0 + r1 + 1 * mmsize]
> +
> +    pmaddwd m1, m1
> +    pmaddwd m2, m2
> +    pmaddwd m3, m3
> +    pmaddwd m4, m4
> +    paddd   m1, m2
> +    paddd   m3, m4
> +    paddd   m1, m3
> +    paddd   m0, m1
> +
> +    movu    m1, [r0 + r1 * 2 + 0 * mmsize]
> +    movu    m2, [r0 + r1 * 2 + 1 * mmsize]
> +    movu    m3, [r0 + r3 + 0 * mmsize]
> +    movu    m4, [r0 + r3 + 1 * mmsize]
> +    lea     r0, [r0 + 4 * r1]
> +
> +    pmaddwd m1, m1
> +    pmaddwd m2, m2
> +    pmaddwd m3, m3
> +    pmaddwd m4, m4
> +    paddd   m1, m2
> +    paddd   m3, m4
> +    paddd   m1, m3
> +    paddd   m0, m1
> +
> +    dec     r2d
> +    jnz    .loop
> +
> +    ; calculate sum
> +    vextracti128 xm1, m0, 1
> +    paddd   xm0, xm1
> +    movhlps xm1, xm0
> +    paddd   xm0, xm1
> +    pshufd  xm1, xm0, 1
> +    paddd   xm0, xm1
> +
> +    movd    eax, xm0
> +    RET
> diff -r c923f4a94946 -r 78f7b217e5d5 source/test/pixelharness.cpp
> --- a/source/test/pixelharness.cpp	Mon Jul 14 17:27:04 2014 +0530
> +++ b/source/test/pixelharness.cpp	Tue Jul 15 17:51:30 2014 -0700
> @@ -394,6 +394,28 @@
>      return true;
>  }
>  
> +bool PixelHarness::check_ssd_s(pixel_ssd_s_t ref, pixel_ssd_s_t opt)
> +{
> +    int j = 0;
> +    for (int i = 0; i < ITERS; i++)
> +    {
> +        // NOTE: stride must be multiple of 16, because minimum block is 4x4
> +        int stride = (STRIDE + (rand() % STRIDE)) & ~15;
> +        int cres = ref(sbuf1 + j, stride);
> +        int vres = (int)checked(opt, sbuf1 + j, (intptr_t)stride);
> +
> +        if (cres != vres)
> +        {
> +            return false;
> +        }
> +
> +        reportfail();
> +        j += INCR;
> +    }
> +
> +    return true;
> +}
> +
>  bool PixelHarness::check_weightp(weightp_sp_t ref, weightp_sp_t opt)
>  {
>      ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
> @@ -1312,6 +1334,15 @@
>              }
>          }
>  
> +        if ((i <= BLOCK_32x32) && opt.ssd_s[i])
> +        {
> +            if (!check_ssd_s(ref.ssd_s[i], opt.ssd_s[i]))
> +            {
> +                printf("ssd_s[%dx%d]: failed!\n", 4 << i, 4 << i);
> +                return false;
> +            }
> +        }
> +
>          if (opt.blockfill_s[i])
>          {
>              if (!check_blockfill_s(ref.blockfill_s[i], opt.blockfill_s[i]))
> @@ -1656,6 +1687,11 @@
>  
>      for (int i = 0; i < NUM_SQUARE_BLOCKS; i++)
>      {
> +        if ((i <= BLOCK_32x32) && opt.ssd_s[i])
> +        {
> +            HEADER("ssd_s[%dx%d]", 4 << i, 4 << i);
> +            REPORT_SPEEDUP(opt.ssd_s[i], ref.ssd_s[i], sbuf1, STRIDE);
> +        }
>          if (opt.sa8d[i])
>          {
>              HEADER("sa8d[%dx%d]", 4 << i, 4 << i);
> diff -r c923f4a94946 -r 78f7b217e5d5 source/test/pixelharness.h
> --- a/source/test/pixelharness.h	Mon Jul 14 17:27:04 2014 +0530
> +++ b/source/test/pixelharness.h	Tue Jul 15 17:51:30 2014 -0700
> @@ -53,6 +53,7 @@
>      bool check_pixel_add_ps(pixel_add_ps_t ref, pixel_add_ps_t opt);
>      bool check_pixeladd_ss(pixeladd_ss_t ref, pixeladd_ss_t opt);
>      bool check_scale_pp(scale_t ref, scale_t opt);
> +    bool check_ssd_s(pixel_ssd_s_t ref, pixel_ssd_s_t opt);
>      bool check_blockfill_s(blockfill_s_t ref, blockfill_s_t opt);
>      bool check_calresidual(calcresidual_t ref, calcresidual_t opt);
>      bool check_calcrecon(calcrecon_t ref, calcrecon_t opt);
> 
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-- 
Steve Borho


More information about the x265-devel mailing list