[x265] [PATCH] replace sse_sp(residual, ZERO) by ssd_s(residual)
Steve Borho
steve at borho.org
Wed Jul 16 01:32:51 CEST 2014
On 07/15, Min Chen wrote:
> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1405471890 25200
> # Node ID 78f7b217e5d53ab981bb0b5ac0f43e8c46260c9f
> # Parent c923f4a9494619665bf49db7ae0e250e2f8c4ec7
> replace sse_sp(residual, ZERO) by ssd_s(residual)
>
> diff -r c923f4a94946 -r 78f7b217e5d5 source/Lib/TLibEncoder/TEncSearch.cpp
> --- a/source/Lib/TLibEncoder/TEncSearch.cpp Mon Jul 14 17:27:04 2014 +0530
> +++ b/source/Lib/TLibEncoder/TEncSearch.cpp Tue Jul 15 17:51:30 2014 -0700
> @@ -2374,9 +2374,8 @@
> if ((cu->getSlice()->getPPS()->getTransquantBypassEnableFlag()))
> {
> bIsTQBypassEnable = true; // mark that the first iteration is to cost TQB mode.
> - tqBypassMode = 2;
> - if (m_param->bLossless)
> - tqBypassMode = 1;
> + if (!m_param->bLossless)
> + tqBypassMode = 2;
The patch looks good except for this part; I'd like Ashok to review this
change. It looks unrelated to the rest of the patch anyway.
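Spelling out the two versions for the review (and assuming tqBypassMode is
initialized to 1 before this block, which is worth double-checking):

    // old: tqBypassMode = 2; if (m_param->bLossless) tqBypassMode = 1;
    // new: if (!m_param->bLossless) tqBypassMode = 2;

Both end up with 1 when lossless and 2 otherwise, but only if the variable
already holds 1 on entry.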
> }
>
> uint64_t bestCost = MAX_INT64;
> @@ -2814,7 +2813,8 @@
> }
>
> int partSize = partitionFromLog2Size(log2TrSize);
> - uint32_t distY = primitives.sse_sp[partSize](resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, (pixel*)RDCost::zeroPel, 0);
> + assert(log2TrSize <= 5);
We should be using X265_CHECK() instead of assert() here.
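Something like this, say (a sketch; the message text is just a placeholder):

    X265_CHECK(log2TrSize <= 5, "log2TrSize out of range\n");

The bound matters because log2TrSize - 2 has to index BLOCK_4x4 through
BLOCK_32x32 in the new ssd_s[] table.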
> + uint32_t distY = primitives.ssd_s[log2TrSize - 2](resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width);
> uint32_t psyEnergyY = 0;
> if (m_rdCost.psyRdEnabled())
> {
> @@ -2923,7 +2923,7 @@
> int16_t *curResiU = m_qtTempShortYuv[qtLayer].getCbAddr(absPartIdxC);
> int16_t *curResiV = m_qtTempShortYuv[qtLayer].getCrAddr(absPartIdxC);
>
> - distU = m_rdCost.scaleChromaDistCb(primitives.sse_sp[partSizeC](resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth, (pixel*)RDCost::zeroPel, 0));
> + distU = m_rdCost.scaleChromaDistCb(primitives.ssd_s[log2TrSizeC - 2](resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth));
>
> if (outZeroDist)
> *outZeroDist += distU;
> @@ -3008,7 +3008,7 @@
> if (!numSigU[tuIterator.section])
> primitives.blockfill_s[sizeIdxC](curResiU, strideResiC, 0);
>
> - distV = m_rdCost.scaleChromaDistCr(primitives.sse_sp[partSizeC](resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth, (pixel*)RDCost::zeroPel, 0));
> + distV = m_rdCost.scaleChromaDistCr(primitives.ssd_s[log2TrSizeC - 2](resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth));
> if (outZeroDist)
> *outZeroDist += distV;
>
> diff -r c923f4a94946 -r 78f7b217e5d5 source/common/pixel.cpp
> --- a/source/common/pixel.cpp Mon Jul 14 17:27:04 2014 +0530
> +++ b/source/common/pixel.cpp Tue Jul 15 17:51:30 2014 -0700
> @@ -375,6 +375,21 @@
> return cost;
> }
>
> +template<int size>
> +int pixel_ssd_s_c(int16_t *a, intptr_t dstride)
> +{
> + int sum = 0;
> + for (int y = 0; y < size; y++)
> + {
> + for (int x = 0; x < size; x++)
> + {
> + sum += a[x] * a[x];
> + }
> + a += dstride;
> + }
> + return sum;
> +}
> +
> void blockcopy_p_p(int bx, int by, pixel *a, intptr_t stridea, pixel *b, intptr_t strideb)
> {
> for (int y = 0; y < by; y++)
> @@ -1200,6 +1215,11 @@
> p.transpose[BLOCK_32x32] = transpose<32>;
> p.transpose[BLOCK_64x64] = transpose<64>;
>
> + p.ssd_s[BLOCK_4x4] = pixel_ssd_s_c<4>;
> + p.ssd_s[BLOCK_8x8] = pixel_ssd_s_c<8>;
> + p.ssd_s[BLOCK_16x16] = pixel_ssd_s_c<16>;
> + p.ssd_s[BLOCK_32x32] = pixel_ssd_s_c<32>;
> +
> p.weight_pp = weight_pp_c;
> p.weight_sp = weight_sp_c;
>
> diff -r c923f4a94946 -r 78f7b217e5d5 source/common/primitives.h
> --- a/source/common/primitives.h Mon Jul 14 17:27:04 2014 +0530
> +++ b/source/common/primitives.h Tue Jul 15 17:51:30 2014 -0700
> @@ -130,6 +130,7 @@
> typedef int (*pixelcmp_t)(pixel *fenc, intptr_t fencstride, pixel *fref, intptr_t frefstride); // fenc is aligned
> typedef int (*pixelcmp_ss_t)(int16_t *fenc, intptr_t fencstride, int16_t *fref, intptr_t frefstride);
> typedef int (*pixelcmp_sp_t)(int16_t *fenc, intptr_t fencstride, pixel *fref, intptr_t frefstride);
> +typedef int (*pixel_ssd_s_t)(int16_t *fenc, intptr_t fencstride);
> typedef void (*pixelcmp_x4_t)(pixel *fenc, pixel *fref0, pixel *fref1, pixel *fref2, pixel *fref3, intptr_t frefstride, int32_t *res);
> typedef void (*pixelcmp_x3_t)(pixel *fenc, pixel *fref0, pixel *fref1, pixel *fref2, intptr_t frefstride, int32_t *res);
> typedef void (*blockcpy_pp_t)(int bx, int by, pixel *dst, intptr_t dstride, pixel *src, intptr_t sstride); // dst is aligned
> @@ -204,6 +205,7 @@
> pixelcmp_t sse_pp[NUM_LUMA_PARTITIONS]; // Sum of Square Error (pixel, pixel) fenc alignment not assumed
> pixelcmp_ss_t sse_ss[NUM_LUMA_PARTITIONS]; // Sum of Square Error (short, short) fenc alignment not assumed
> pixelcmp_sp_t sse_sp[NUM_LUMA_PARTITIONS]; // Sum of Square Error (short, pixel) fenc alignment not assumed
> + pixel_ssd_s_t ssd_s[NUM_SQUARE_BLOCKS - 1]; // Sum of Square Error (short) fenc alignment not assumed
> pixelcmp_t satd[NUM_LUMA_PARTITIONS]; // Sum of Transformed differences (HADAMARD)
> pixelcmp_t sa8d_inter[NUM_LUMA_PARTITIONS]; // sa8d primitives for motion search partitions
> pixelcmp_t sa8d[NUM_SQUARE_BLOCKS]; // sa8d primitives for square intra blocks
> diff -r c923f4a94946 -r 78f7b217e5d5 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Mon Jul 14 17:27:04 2014 +0530
> +++ b/source/common/x86/asm-primitives.cpp Tue Jul 15 17:51:30 2014 -0700
> @@ -1024,6 +1024,12 @@
> p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_sse2;
> p.blockfill_s[BLOCK_32x32] = x265_blockfill_s_32x32_sse2;
>
> + // TODO: overflow in 12-bit mode!
> + p.ssd_s[BLOCK_4x4] = x265_pixel_ssd_s_4_sse2;
> + p.ssd_s[BLOCK_8x8] = x265_pixel_ssd_s_8_sse2;
> + p.ssd_s[BLOCK_16x16] = x265_pixel_ssd_s_16_sse2;
> + p.ssd_s[BLOCK_32x32] = x265_pixel_ssd_s_32_sse2;
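To put a rough number on that TODO: at 12 bits a residual coefficient can
reach about +/-2^12, so each squared term approaches 2^24; pmaddwd folds two
of them into one 32-bit lane, and a 32x32 block then accumulates 256 such
results per lane, i.e. close to 2^33. That overflows a signed 32-bit
accumulator, while 8- and 10-bit inputs stay comfortably within range. The C
reference above has the same exposure, since it sums into a plain int.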
> +
> p.calcresidual[BLOCK_4x4] = x265_getResidual4_sse2;
> p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2;
> p.calcresidual[BLOCK_16x16] = x265_getResidual16_sse2;
> @@ -1156,6 +1162,11 @@
> p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_sse2;
> p.blockfill_s[BLOCK_32x32] = x265_blockfill_s_32x32_sse2;
>
> + p.ssd_s[BLOCK_4x4] = x265_pixel_ssd_s_4_sse2;
> + p.ssd_s[BLOCK_8x8] = x265_pixel_ssd_s_8_sse2;
> + p.ssd_s[BLOCK_16x16] = x265_pixel_ssd_s_16_sse2;
> + p.ssd_s[BLOCK_32x32] = x265_pixel_ssd_s_32_sse2;
> +
> p.frame_init_lowres_core = x265_frame_init_lowres_core_sse2;
> SA8D_INTER_FROM_BLOCK(sse2);
>
> @@ -1315,6 +1326,7 @@
> INIT2_NAME(sse_pp, ssd, _avx2);
> p.sad_x4[LUMA_16x12] = x265_pixel_sad_x4_16x12_avx2;
> p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_avx2;
> + p.ssd_s[BLOCK_32x32] = x265_pixel_ssd_s_32_avx2;
> }
> #endif // if HIGH_BIT_DEPTH
> }
> diff -r c923f4a94946 -r 78f7b217e5d5 source/common/x86/pixel.h
> --- a/source/common/x86/pixel.h Mon Jul 14 17:27:04 2014 +0530
> +++ b/source/common/x86/pixel.h Tue Jul 15 17:51:30 2014 -0700
> @@ -166,6 +166,12 @@
> int x265_pixel_ssd_64x48_sse4(pixel *, intptr_t, pixel *, intptr_t);
> int x265_pixel_ssd_64x64_sse4(pixel *, intptr_t, pixel *, intptr_t);
>
> +int x265_pixel_ssd_s_4_sse2(int16_t *, intptr_t);
> +int x265_pixel_ssd_s_8_sse2(int16_t *, intptr_t);
> +int x265_pixel_ssd_s_16_sse2(int16_t *, intptr_t);
> +int x265_pixel_ssd_s_32_sse2(int16_t *, intptr_t);
> +int x265_pixel_ssd_s_32_avx2(int16_t *, intptr_t);
> +
> #define ADDAVG(func) \
> void x265_ ## func ## _sse4(int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t);
> ADDAVG(addAvg_2x4)
> diff -r c923f4a94946 -r 78f7b217e5d5 source/common/x86/ssd-a.asm
> --- a/source/common/x86/ssd-a.asm Mon Jul 14 17:27:04 2014 +0530
> +++ b/source/common/x86/ssd-a.asm Tue Jul 15 17:51:30 2014 -0700
> @@ -2395,3 +2395,224 @@
> HADDD m7, m1
> movd eax, m7
> RET
> +
> +
> +;-----------------------------------------------------------------------------
> +; int pixel_ssd_s( int16_t *ref, intptr_t i_stride )
> +;-----------------------------------------------------------------------------
> +INIT_XMM sse2
> +cglobal pixel_ssd_s_4, 2,2,2
> + add r1, r1
> + movh m0, [r0]
> + movhps m0, [r0 + r1]
> +
> + lea r0, [r0 + r1 * 2]
> + movh m1, [r0]
> + movhps m1, [r0 + r1]
> +
> + pmaddwd m0, m0
> + pmaddwd m1, m1
> +
> + ; calculate sum
> + paddd m0, m1
> + movhlps m1, m0
> + paddd m0, m1
> + pshufd m1, m0, 1
> + paddd m0, m1
> +
> + movd eax, m0
> + RET
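For anyone cross-checking the reduction pattern used throughout these
kernels, here is a rough C intrinsics rendering of the 4x4 case (an untested
sketch for illustration, not part of the patch; the function name is mine).
The asm doubles r1 up front because the stride argument is in int16_t units,
which plain pointer arithmetic handles below:

    #include <emmintrin.h> /* SSE2 */

    static int ssd_s_4x4_sketch(const int16_t *a, intptr_t stride)
    {
        __m128i sum = _mm_setzero_si128();
        for (int y = 0; y < 4; y++)
        {
            /* one row of four coefficients; the upper half loads as zero */
            __m128i row = _mm_loadl_epi64((const __m128i*)(a + y * stride));
            /* pmaddwd: square pairs of int16 and add into 32-bit lanes */
            sum = _mm_add_epi32(sum, _mm_madd_epi16(row, row));
        }
        /* horizontal sum of the four lanes (movhlps + pshufd in the asm) */
        sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2)));
        sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, _MM_SHUFFLE(2, 3, 0, 1)));
        return _mm_cvtsi128_si32(sum);
    }

The asm packs two rows per register with movh/movhps instead of one, but the
arithmetic is the same.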
> +
> +
> +INIT_XMM sse2
> +cglobal pixel_ssd_s_8, 2,3,5
> + add r1, r1
> + lea r2, [r1 * 3]
> + movu m0, [r0]
> + movu m1, [r0 + r1]
> + movu m2, [r0 + r1 * 2]
> + movu m3, [r0 + r2]
> +
> + pmaddwd m0, m0
> + pmaddwd m1, m1
> + pmaddwd m2, m2
> + pmaddwd m3, m3
> + paddd m0, m1
> + paddd m2, m3
> + paddd m0, m2
> +
> + lea r0, [r0 + r1 * 4]
> + movu m4, [r0]
> + movu m1, [r0 + r1]
> + movu m2, [r0 + r1 * 2]
> + movu m3, [r0 + r2]
> +
> + pmaddwd m4, m4
> + pmaddwd m1, m1
> + pmaddwd m2, m2
> + pmaddwd m3, m3
> + paddd m4, m1
> + paddd m2, m3
> + paddd m4, m2
> +
> + ; calculate sum
> + paddd m0, m4
> + movhlps m1, m0
> + paddd m0, m1
> + pshufd m1, m0, 1
> + paddd m0, m1
> +
> + movd eax, m0
> + RET
> +
> +
> +INIT_XMM sse2
> +cglobal pixel_ssd_s_16, 2,3,5
> + add r1, r1
> +
> + mov r2d, 4
> + pxor m0, m0
> +.loop:
> + movu m1, [r0]
> + movu m2, [r0 + mmsize]
> + movu m3, [r0 + r1]
> + movu m4, [r0 + r1 + mmsize]
> + lea r0, [r0 + r1 * 2]
> +
> + pmaddwd m1, m1
> + pmaddwd m2, m2
> + pmaddwd m3, m3
> + pmaddwd m4, m4
> + paddd m1, m2
> + paddd m3, m4
> + paddd m1, m3
> + paddd m0, m1
> +
> + movu m1, [r0]
> + movu m2, [r0 + mmsize]
> + movu m3, [r0 + r1]
> + movu m4, [r0 + r1 + mmsize]
> + lea r0, [r0 + r1 * 2]
> +
> + pmaddwd m1, m1
> + pmaddwd m2, m2
> + pmaddwd m3, m3
> + pmaddwd m4, m4
> + paddd m1, m2
> + paddd m3, m4
> + paddd m1, m3
> + paddd m0, m1
> +
> + dec r2d
> + jnz .loop
> +
> + ; calculate sum
> + movhlps m1, m0
> + paddd m0, m1
> + pshufd m1, m0, 1
> + paddd m0, m1
> +
> + movd eax, m0
> + RET
> +
> +
> +INIT_XMM sse2
> +cglobal pixel_ssd_s_32, 2,3,5
> + add r1, r1
> +
> + mov r2d, 16
> + pxor m0, m0
> +.loop:
> + movu m1, [r0 + 0 * mmsize]
> + movu m2, [r0 + 1 * mmsize]
> + movu m3, [r0 + 2 * mmsize]
> + movu m4, [r0 + 3 * mmsize]
> + add r0, r1
> +
> + pmaddwd m1, m1
> + pmaddwd m2, m2
> + pmaddwd m3, m3
> + pmaddwd m4, m4
> + paddd m1, m2
> + paddd m3, m4
> + paddd m1, m3
> + paddd m0, m1
> +
> + movu m1, [r0 + 0 * mmsize]
> + movu m2, [r0 + 1 * mmsize]
> + movu m3, [r0 + 2 * mmsize]
> + movu m4, [r0 + 3 * mmsize]
> + add r0, r1
> +
> + pmaddwd m1, m1
> + pmaddwd m2, m2
> + pmaddwd m3, m3
> + pmaddwd m4, m4
> + paddd m1, m2
> + paddd m3, m4
> + paddd m1, m3
> + paddd m0, m1
> +
> + dec r2d
> + jnz .loop
> +
> + ; calculate sum
> + movhlps m1, m0
> + paddd m0, m1
> + pshufd m1, m0, 1
> + paddd m0, m1
> +
> + movd eax, m0
> + RET
> +
> +
> +INIT_YMM avx2
> +cglobal pixel_ssd_s_32, 2,4,5
> + add r1, r1
> + lea r3, [r1 * 3]
> +
> + mov r2d, 8
> + pxor m0, m0
> +.loop:
> + movu m1, [r0 + 0 * mmsize]
> + movu m2, [r0 + 1 * mmsize]
> + movu m3, [r0 + r1 + 0 * mmsize]
> + movu m4, [r0 + r1 + 1 * mmsize]
> +
> + pmaddwd m1, m1
> + pmaddwd m2, m2
> + pmaddwd m3, m3
> + pmaddwd m4, m4
> + paddd m1, m2
> + paddd m3, m4
> + paddd m1, m3
> + paddd m0, m1
> +
> + movu m1, [r0 + r1 * 2 + 0 * mmsize]
> + movu m2, [r0 + r1 * 2 + 1 * mmsize]
> + movu m3, [r0 + r3 + 0 * mmsize]
> + movu m4, [r0 + r3 + 1 * mmsize]
> + lea r0, [r0 + 4 * r1]
> +
> + pmaddwd m1, m1
> + pmaddwd m2, m2
> + pmaddwd m3, m3
> + pmaddwd m4, m4
> + paddd m1, m2
> + paddd m3, m4
> + paddd m1, m3
> + paddd m0, m1
> +
> + dec r2d
> + jnz .loop
> +
> + ; calculate sum
> + vextracti128 xm1, m0, 1
> + paddd xm0, xm1
> + movhlps xm1, xm0
> + paddd xm0, xm1
> + pshufd xm1, xm0, 1
> + paddd xm0, xm1
> +
> + movd eax, xm0
> + RET
> diff -r c923f4a94946 -r 78f7b217e5d5 source/test/pixelharness.cpp
> --- a/source/test/pixelharness.cpp Mon Jul 14 17:27:04 2014 +0530
> +++ b/source/test/pixelharness.cpp Tue Jul 15 17:51:30 2014 -0700
> @@ -394,6 +394,28 @@
> return true;
> }
>
> +bool PixelHarness::check_ssd_s(pixel_ssd_s_t ref, pixel_ssd_s_t opt)
> +{
> + int j = 0;
> + for (int i = 0; i < ITERS; i++)
> + {
> + // NOTE: stride must be a multiple of 16, because the minimum block is 4x4
> + int stride = (STRIDE + (rand() % STRIDE)) & ~15;
> + int cres = ref(sbuf1 + j, stride);
> + int vres = (int)checked(opt, sbuf1 + j, (intptr_t)stride);
> +
> + if (cres != vres)
> + {
> + return false;
> + }
> +
> + reportfail();
> + j += INCR;
> + }
> +
> + return true;
> +}
> +
> bool PixelHarness::check_weightp(weightp_sp_t ref, weightp_sp_t opt)
> {
> ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
> @@ -1312,6 +1334,15 @@
> }
> }
>
> + if ((i <= BLOCK_32x32) && opt.ssd_s[i])
> + {
> + if (!check_ssd_s(ref.ssd_s[i], opt.ssd_s[i]))
> + {
> + printf("ssd_s[%dx%d]: failed!\n", 4 << i, 4 << i);
> + return false;
> + }
> + }
> +
> if (opt.blockfill_s[i])
> {
> if (!check_blockfill_s(ref.blockfill_s[i], opt.blockfill_s[i]))
> @@ -1656,6 +1687,11 @@
>
> for (int i = 0; i < NUM_SQUARE_BLOCKS; i++)
> {
> + if ((i <= BLOCK_32x32) && opt.ssd_s[i])
> + {
> + HEADER("ssd_s[%dx%d]", 4 << i, 4 << i);
> + REPORT_SPEEDUP(opt.ssd_s[i], ref.ssd_s[i], sbuf1, STRIDE);
> + }
> if (opt.sa8d[i])
> {
> HEADER("sa8d[%dx%d]", 4 << i, 4 << i);
> diff -r c923f4a94946 -r 78f7b217e5d5 source/test/pixelharness.h
> --- a/source/test/pixelharness.h Mon Jul 14 17:27:04 2014 +0530
> +++ b/source/test/pixelharness.h Tue Jul 15 17:51:30 2014 -0700
> @@ -53,6 +53,7 @@
> bool check_pixel_add_ps(pixel_add_ps_t ref, pixel_add_ps_t opt);
> bool check_pixeladd_ss(pixeladd_ss_t ref, pixeladd_ss_t opt);
> bool check_scale_pp(scale_t ref, scale_t opt);
> + bool check_ssd_s(pixel_ssd_s_t ref, pixel_ssd_s_t opt);
> bool check_blockfill_s(blockfill_s_t ref, blockfill_s_t opt);
> bool check_calresidual(calcresidual_t ref, calcresidual_t opt);
> bool check_calcrecon(calcrecon_t ref, calcrecon_t opt);
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
--
Steve Borho