[x265] [PATCH] count_nonzero primitive, downscaling quantCoeff from int32_t* to int16_t*
Deepthi Nandakumar
deepthi at multicorewareinc.com
Thu Aug 14 09:26:49 CEST 2014
Praveen,
Can you build a Mercurial queue for these quant patches, so that they can be
reviewed and pushed once quant is 16-bit everywhere?
Thanks,
Deepthi
On Thu, Aug 14, 2014 at 2:01 AM, Steve Borho <steve at borho.org> wrote:
> On 08/12, praveen at multicorewareinc.com wrote:
> > # HG changeset patch
> > # User Praveen Tiwari
> > # Date 1407834530 -19800
> > # Node ID bb4d44663964237e4b66af6d92b2f13dbcf4f9b9
> > # Parent 8a7f4bb1d1be32fe668d410450c2e320ccae6098
> > count_nonzero primitive, downscaling quantCoeff from int32_t* to int16_t*
>
> There's not much point in applying these patches until all of the quant
> primitives are using short ints for coefficients. As-is this will just
> be a slow-down.
>
> > diff -r 8a7f4bb1d1be -r bb4d44663964 source/common/dct.cpp
> > --- a/source/common/dct.cpp Tue Aug 12 01:11:39 2014 -0500
> > +++ b/source/common/dct.cpp Tue Aug 12 14:38:50 2014 +0530
> > @@ -815,7 +815,7 @@
> > return numSig;
> > }
> >
> > -int count_nonzero_c(const int32_t *quantCoeff, int numCoeff)
> > +int count_nonzero_c(const int16_t *quantCoeff, int numCoeff)
> > {
> > X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not
> aligned\n");
> > X265_CHECK(numCoeff > 0 && (numCoeff & 15) == 0, "numCoeff invalid
> %d\n", numCoeff);
> > diff -r 8a7f4bb1d1be -r bb4d44663964 source/common/primitives.h
> > --- a/source/common/primitives.h Tue Aug 12 01:11:39 2014 -0500
> > +++ b/source/common/primitives.h Tue Aug 12 14:38:50 2014 +0530
> > @@ -163,7 +163,7 @@
> > typedef uint32_t (*nquant_t)(int32_t *coef, int32_t *quantCoeff,
> int32_t *qCoef, int qBits, int add, int numCoeff);
> > typedef void (*dequant_scaling_t)(const int32_t* src, const int32_t
> *dequantCoef, int32_t* dst, int num, int mcqp_miper, int shift);
> > typedef void (*dequant_normal_t)(const int32_t* quantCoef, int32_t*
> coef, int num, int scale, int shift);
> > -typedef int (*count_nonzero_t)(const int32_t *quantCoeff, int
> numCoeff);
> > +typedef int (*count_nonzero_t)(const int16_t *quantCoeff, int
> numCoeff);
> >
> > typedef void (*weightp_pp_t)(pixel *src, pixel *dst, intptr_t
> srcStride, intptr_t dstStride, int width, int height, int w0, int round,
> int shift, int offset);
> > typedef void (*weightp_sp_t)(int16_t *src, pixel *dst, intptr_t
> srcStride, intptr_t dstStride, int width, int height, int w0, int round,
> int shift, int offset);
> > diff -r 8a7f4bb1d1be -r bb4d44663964 source/common/quant.cpp
> > --- a/source/common/quant.cpp Tue Aug 12 01:11:39 2014 -0500
> > +++ b/source/common/quant.cpp Tue Aug 12 14:38:50 2014 +0530
> > @@ -2,6 +2,7 @@
> > * Copyright (C) 2014 x265 project
> > *
> > * Authors: Steve Borho <steve at borho.org>
> > + * Praveen Kumar Tiwari <praveen at multicorewareinc.com>
> > *
> > * This program is free software; you can redistribute it and/or modify
> > * it under the terms of the GNU General Public License as published by
> > @@ -463,7 +464,17 @@
> > const uint32_t sizeIdx = log2TrSize - 2;
> > int useDST = !sizeIdx && ttype == TEXT_LUMA && bIntra;
> >
> > - X265_CHECK((int)numSig == primitives.count_nonzero(coeff, 1 <<
> log2TrSize * 2), "numSig differ\n");
> > + /* This section of code safely converts int32_t
> coefficients to int16_t; once the caller function is
> > + * optimized to take coefficients as int16_t*, it will be
> removed. */
> > + int numCoeff = (1 << (log2TrSize * 2));
> > + assert(numCoeff <= 1024);
> > + ALIGN_VAR_16(int16_t, qCoeff[32 * 32]);
> > + for (int i = 0; i < numCoeff; i++)
> > + {
> > + qCoeff[i] = (coeff[i] & 0xFFFF);
> > + }
> > +
> > + X265_CHECK((int)numSig == primitives.count_nonzero(qCoeff, 1 <<
> log2TrSize * 2), "numSig differ\n");
> >
> > // DC only
> > if (numSig == 1 && coeff[0] != 0 && !useDST)
> > @@ -501,7 +512,16 @@
> > int numCoeff = 1 << log2TrSize * 2;
> > uint32_t numSig = primitives.nquant(m_resiDctCoeff, qCoef,
> dstCoeff, qbits, add, numCoeff);
>
> These two loops are only here for an X265_CHECK statement that is
> usually compiled out. All of this code should have been wrapped within
> an "#if CHECKED_BUILD || _DEBUG" block.
>
> > - X265_CHECK((int)numSig == primitives.count_nonzero(dstCoeff,
> numCoeff), "numSig differ\n");
> > + /* This section of code safely converts int32_t coefficients
> to int16_t; once the caller function is
> > + * optimized to take coefficients as int16_t*, it will be removed. */
> > + assert(numCoeff <= 1024);
> > + ALIGN_VAR_16(int16_t, qCoeff[32 * 32]);
> > + for (int i = 0; i < numCoeff; i++)
> > + {
> > + qCoeff[i] = (dstCoeff[i] & 0xFFFF);
> > + }
> > +
> > + X265_CHECK((int)numSig == primitives.count_nonzero(qCoeff,
> numCoeff), "numSig differ\n");
> > if (!numSig)
> > return 0;
> >
> > diff -r 8a7f4bb1d1be -r bb4d44663964 source/common/x86/pixel-util.h
> > --- a/source/common/x86/pixel-util.h Tue Aug 12 01:11:39 2014 -0500
> > +++ b/source/common/x86/pixel-util.h Tue Aug 12 14:38:50 2014 +0530
> > @@ -2,6 +2,7 @@
> > * Copyright (C) 2013 x265 project
> > *
> > * Authors: Steve Borho <steve at borho.org>
> > + * Praveen Kumar Tiwari <praveen at multicorewareinc.com>
> > *
> > * This program is free software; you can redistribute it and/or modify
> > * it under the terms of the GNU General Public License as published by
> > @@ -47,7 +48,7 @@
> > uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t
> *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff);
> > uint32_t x265_nquant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t
> *qCoef, int qBits, int add, int numCoeff);
> > void x265_dequant_normal_sse4(const int32_t* quantCoef, int32_t* coef,
> int num, int scale, int shift);
> > -int x265_count_nonzero_ssse3(const int32_t *quantCoeff, int numCoeff);
> > +int x265_count_nonzero_ssse3(const int16_t *quantCoeff, int numCoeff);
> >
> > void x265_weight_pp_sse4(pixel *src, pixel *dst, intptr_t srcStride,
> intptr_t dstStride, int width, int height, int w0, int round, int shift,
> int offset);
> > void x265_weight_sp_sse4(int16_t *src, pixel *dst, intptr_t srcStride,
> intptr_t dstStride, int width, int height, int w0, int round, int shift,
> int offset);
> > diff -r 8a7f4bb1d1be -r bb4d44663964 source/common/x86/pixel-util8.asm
> > --- a/source/common/x86/pixel-util8.asm Tue Aug 12 01:11:39 2014
> -0500
> > +++ b/source/common/x86/pixel-util8.asm Tue Aug 12 14:38:50 2014
> +0530
> > @@ -3,6 +3,7 @@
> > ;*
> > ;* Authors: Min Chen <chenm003 at 163.com> <min.chen at multicorewareinc.com>
> > ;* Nabajit Deka <nabajit at multicorewareinc.com>
> > +;* Praveen Kumar Tiwari <praveen at multicorewareinc.com>
> > ;*
> > ;* This program is free software; you can redistribute it and/or modify
> > ;* it under the terms of the GNU General Public License as published by
> > @@ -1091,10 +1092,10 @@
> >
> >
> >
> ;-----------------------------------------------------------------------------
> > -; int count_nonzero(const int32_t *quantCoeff, int numCoeff);
> > +; int count_nonzero(const int16_t *quantCoeff, int numCoeff);
> >
> ;-----------------------------------------------------------------------------
> > INIT_XMM ssse3
> > -cglobal count_nonzero, 2,2,5
> > +cglobal count_nonzero, 2,2,4
> > pxor m0, m0
> > shr r1d, 4
> > movd m1, r1d
> > @@ -1103,12 +1104,8 @@
> > .loop:
> > mova m2, [r0 + 0]
> > mova m3, [r0 + 16]
> > - packssdw m2, m3
> > - mova m3, [r0 + 32]
> > - mova m4, [r0 + 48]
> > - add r0, 64
> > - packssdw m3, m4
> > packsswb m2, m3
> > + add r0, 32
> > pcmpeqb m2, m0
> > paddb m1, m2
> > dec r1d
> > diff -r 8a7f4bb1d1be -r bb4d44663964 source/encoder/entropy.cpp
> > --- a/source/encoder/entropy.cpp Tue Aug 12 01:11:39 2014 -0500
> > +++ b/source/encoder/entropy.cpp Tue Aug 12 14:38:50 2014 +0530
> > @@ -2,6 +2,7 @@
> > * Copyright (C) 2013 x265 project
> > *
> > * Authors: Steve Borho <steve at borho.org>
> > +* Praveen Kumar Tiwari <praveen at multicorewareinc.com>
> > *
> > * This program is free software; you can redistribute it and/or modify
> > * it under the terms of the GNU General Public License as published by
> > @@ -1488,8 +1489,18 @@
> > {
> > uint32_t trSize = 1 << log2TrSize;
> >
> > + /* This section of code safely converts int32_t coefficients
> to int16_t; once the caller function is
> > + * optimized to take coefficients as int16_t*, it will be removed. */
> > + int numCoeff = (1 << (log2TrSize << 1));
> > + assert(numCoeff <= 1024);
> > + ALIGN_VAR_16(int16_t, qCoeff[32 * 32]);
> > + for (int i = 0; i < numCoeff; i++)
> > + {
> > + qCoeff[i] = (coeff[i] & 0xFFFF);
> > + }
> > +
> > // compute number of significant coefficients
> > - uint32_t numSig = primitives.count_nonzero(coeff, (1 << (log2TrSize
> << 1)));
> > + uint32_t numSig = primitives.count_nonzero(qCoeff, (1 <<
> (log2TrSize << 1)));
> >
> > X265_CHECK(numSig > 0, "cbf check fail\n");
> >
> > diff -r 8a7f4bb1d1be -r bb4d44663964 source/test/mbdstharness.cpp
> > --- a/source/test/mbdstharness.cpp Tue Aug 12 01:11:39 2014 -0500
> > +++ b/source/test/mbdstharness.cpp Tue Aug 12 14:38:50 2014 +0530
> > @@ -366,7 +366,7 @@
> >
> > bool MBDstHarness::check_count_nonzero_primitive(count_nonzero_t ref,
> count_nonzero_t opt)
> > {
> > - ALIGN_VAR_32(int32_t, qcoeff[32 * 32]);
> > + ALIGN_VAR_32(int16_t, qcoeff[32 * 32]);
> >
> > for (int i = 0; i < 4; i++)
> > {
> > @@ -376,7 +376,7 @@
> >
> > for (int n = 0; n <= num; n++)
> > {
> > - memset(qcoeff, 0, num * sizeof(int32_t));
> > + memset(qcoeff, 0, num * sizeof(int16_t));
> >
> > for (int j = 0; j < n; j++)
> > {
> > @@ -386,7 +386,7 @@
> > k = (k + 11) & mask;
> > }
> >
> > - qcoeff[k] = rand() - RAND_MAX / 2;
> > + qcoeff[k] = (int16_t)rand() - RAND_MAX / 2;
> > }
> >
> > int refval = ref(qcoeff, num);
> > @@ -516,7 +516,7 @@
> > for (int i = 4; i <= 32; i <<= 1)
> > {
> > printf("count_nonzero[%dx%d]", i, i);
> > - REPORT_SPEEDUP(opt.count_nonzero, ref.count_nonzero,
> mbufidct, i * i)
> > + REPORT_SPEEDUP(opt.count_nonzero, ref.count_nonzero, mbuf1,
> i * i)
> > }
> > }
> > }
> > _______________________________________________
> > x265-devel mailing list
> > x265-devel at videolan.org
> > https://mailman.videolan.org/listinfo/x265-devel
>
> --
> Steve Borho
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20140814/5ab8c47a/attachment-0001.html>
More information about the x265-devel mailing list