[x265] [PATCH 6 of 6] asm: improve costCoeffRemain by bypass uncoded coeff
Deepthi Nandakumar
deepthi at multicorewareinc.com
Wed Jun 10 09:32:25 CEST 2015
Min, thanks for this series of patches; all of them have been pushed.
On Tue, Jun 9, 2015 at 11:36 PM, Min Chen <chenm003 at 163.com> wrote:
> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1433872879 25200
> # Node ID e5b6f0a984bdd8ab16b63fb1c11a508a444515ec
> # Parent 134670771e0c1dd0800c3e9db0a1f9f69c467e36
> asm: improve costCoeffRemain by bypass uncoded coeff
> ---
> source/common/dct.cpp | 17 ++++----
> source/common/primitives.h | 2 +-
> source/common/x86/pixel-util.h | 2 +-
> source/common/x86/pixel-util8.asm | 41 +++++++++----------
> source/encoder/entropy.cpp | 82
> ++++++++++++++++++++++++++++++++-----
> 5 files changed, 101 insertions(+), 43 deletions(-)
>
> diff -r 134670771e0c -r e5b6f0a984bd source/common/dct.cpp
> --- a/source/common/dct.cpp Tue Jun 09 11:01:15 2015 -0700
> +++ b/source/common/dct.cpp Tue Jun 09 11:01:19 2015 -0700
> @@ -874,19 +874,19 @@
> return (sum & 0xFFFFFF);
> }
>
> -uint32_t costCoeffRemain_c(uint16_t *absCoeff, int numNonZero)
> +uint32_t costCoeffRemain_c(uint16_t *absCoeff, int numNonZero, int idx)
> {
> uint32_t goRiceParam = 0;
> - int firstCoeff2 = 1;
> - uint32_t baseLevelN = 0x5555AAAA; // 2-bits encode format baseLevel
>
> uint32_t sum = 0;
> - int idx = 0;
> + int baseLevel = 3;
> do
> {
> - int baseLevel = (baseLevelN & 3) | firstCoeff2;
> - X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ? (2 +
> firstCoeff2) : 1), "baseLevel check failurr\n");
> - baseLevelN >>= 2;
> + if (idx >= C1FLAG_NUMBER)
> + baseLevel = 1;
> +
> + // TODO: the IDX is not really idx, so this check inactive
> + //X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ? (2 +
> firstCoeff2) : 1), "baseLevel check failurr\n");
> int codeNumber = absCoeff[idx] - baseLevel;
>
> if (codeNumber >= 0)
> @@ -912,8 +912,7 @@
> goRiceParam = (goRiceParam + 1) - (goRiceParam >> 2);
> X265_CHECK(goRiceParam <= 4, "goRiceParam check failure\n");
> }
> - if (absCoeff[idx] >= 2)
> - firstCoeff2 = 0;
> + baseLevel = 2;
> idx++;
> }
> while(idx < numNonZero);
> diff -r 134670771e0c -r e5b6f0a984bd source/common/primitives.h
> --- a/source/common/primitives.h Tue Jun 09 11:01:15 2015 -0700
> +++ b/source/common/primitives.h Tue Jun 09 11:01:19 2015 -0700
> @@ -187,7 +187,7 @@
> typedef uint32_t (*findPosFirstLast_t)(const int16_t *dstCoeff, const
> intptr_t trSize, const uint16_t scanTbl[16]);
>
> typedef uint32_t (*costCoeffNxN_t)(const uint16_t *scan, const coeff_t
> *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx,
> uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int
> subPosBase);
> -typedef uint32_t (*costCoeffRemain_t)(uint16_t *absCoeff, int numNonZero);
> +typedef uint32_t (*costCoeffRemain_t)(uint16_t *absCoeff, int numNonZero,
> int idx);
>
> /* Function pointers to optimized encoder primitives. Each pointer can
> reference
> * either an assembly routine, a SIMD intrinsic primitive, or a C
> function */
> diff -r 134670771e0c -r e5b6f0a984bd source/common/x86/pixel-util.h
> --- a/source/common/x86/pixel-util.h Tue Jun 09 11:01:15 2015 -0700
> +++ b/source/common/x86/pixel-util.h Tue Jun 09 11:01:19 2015 -0700
> @@ -83,7 +83,7 @@
> uint32_t x265_findPosFirstLast_ssse3(const int16_t *dstCoeff, const
> intptr_t trSize, const uint16_t scanTbl[16]);
>
> uint32_t x265_costCoeffNxN_sse4(const uint16_t *scan, const coeff_t
> *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx,
> uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int
> subPosBase);
> -uint32_t x265_costCoeffRemain_sse4(uint16_t *absCoeff, int numNonZero);
> +uint32_t x265_costCoeffRemain_sse4(uint16_t *absCoeff, int numNonZero,
> int idx);
>
>
> #define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \
> diff -r 134670771e0c -r e5b6f0a984bd source/common/x86/pixel-util8.asm
> --- a/source/common/x86/pixel-util8.asm Tue Jun 09 11:01:15 2015 -0700
> +++ b/source/common/x86/pixel-util8.asm Tue Jun 09 11:01:19 2015 -0700
> @@ -6572,7 +6572,7 @@
> ;}
> ;while(idx < numNonZero);
>
> -; uint32_t costCoeffRemain(uint16_t *absCoeff, int numNonZero)
> +; uint32_t costCoeffRemain(uint16_t *absCoeff, int numNonZero, int idx)
> INIT_XMM sse4
> cglobal costCoeffRemain, 0,7,1
> ; assign RCX to R3
> @@ -6580,48 +6580,43 @@
> %if WIN64
> DECLARE_REG_TMP 3,1,2,0
> mov t0, r0
> + mov r4d, r2d
> %elif ARCH_X86_64
> ; *nix x64 didn't do anything
> DECLARE_REG_TMP 0,1,2,3
> + mov r4d, r2d
> %else ; X86_32
> DECLARE_REG_TMP 6,3,2,1
> mov t0, r0m
> + mov r4d, r2m
> %endif
>
> - mova m0, [t0]
> - packsswb m0, [t0 + mmsize]
> - pcmpgtb m0, [pb_1]
> - pmovmskb r2d, m0
> - bsf r2d, r2d
> - lea r2d, [r2 * 2 + 1]
> - xor r4d, r4d
> - bts r4d, r2d
> - dec r4d
> - and r4d, 0x55555555
> - or r4d, 0x5555AAAA
> -
> xor t3d, t3d
> xor r5d, r5d
>
> + lea t0, [t0 + r4 * 2]
> + mov r2d, 3
> +
> ; register mapping
> - ; r4d - baseLevelN
> - ; r2 - tmp
> + ; r2d - baseLevel & tmp
> + ; r4d - idx
> ; t3 - goRiceParam
> - ; eax - tmp - absCoeff[idx]
> + ; eax - absCoeff[idx] & tmp
> ; r5 - sum
>
> .loop:
> + mov eax, 1
> + cmp r4d, 8
> + cmovge r2d, eax
> +
> movzx eax, word [t0]
> add t0, 2
> - mov r2d, r4d
> - and r2d, 3
> - shr r4d, 2
> sub eax, r2d ; codeNumber = absCoeff[idx] -
> baseLevel
> jl .next
>
> shr eax, t3b ; codeNumber =
> ((uint32_t)codeNumber >> goRiceParam) - COEF_REMAIN_BIN_REDUCTION
>
> - lea r2d, [eax - 3 + 1] ; CLZ(cidx, codeNumber + 1);
> + lea r2d, [rax - 3 + 1] ; CLZ(cidx, codeNumber + 1);
> bsr r2d, r2d
> add r2d, r2d ; codeNumber = (length + length)
>
> @@ -6644,8 +6639,10 @@
> add t3b, al
>
> .next:
> - dec dword r1m
> - jnz .loop
> + inc r4d
> + mov r2d, 2
> + cmp r4d, r1m
> + jl .loop
>
> mov eax, r5d
> RET
> diff -r 134670771e0c -r e5b6f0a984bd source/encoder/entropy.cpp
> --- a/source/encoder/entropy.cpp Tue Jun 09 11:01:15 2015 -0700
> +++ b/source/encoder/entropy.cpp Tue Jun 09 11:01:19 2015 -0700
> @@ -1431,6 +1431,55 @@
> encodeBin(cu.getCbf(absPartIdx, ttype, lowestTUDepth),
> m_contextState[OFF_QT_CBF_CTX + ctx]);
> }
>
> +#if CHECKED_BUILD || _DEBUG
> +uint32_t costCoeffRemain_c0(uint16_t *absCoeff, int numNonZero)
> +{
> + uint32_t goRiceParam = 0;
> + int firstCoeff2 = 1;
> + uint32_t baseLevelN = 0x5555AAAA; // 2-bits encode format baseLevel
> +
> + uint32_t sum = 0;
> + int idx = 0;
> + do
> + {
> + int baseLevel = (baseLevelN & 3) | firstCoeff2;
> + X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ? (2 +
> firstCoeff2) : 1), "baseLevel check failurr\n");
> + baseLevelN >>= 2;
> + int codeNumber = absCoeff[idx] - baseLevel;
> +
> + if (codeNumber >= 0)
> + {
> + //writeCoefRemainExGolomb(absCoeff[idx] - baseLevel,
> goRiceParam);
> + uint32_t length = 0;
> +
> + codeNumber = ((uint32_t)codeNumber >> goRiceParam) -
> COEF_REMAIN_BIN_REDUCTION;
> + if (codeNumber >= 0)
> + {
> + {
> + unsigned long cidx;
> + CLZ(cidx, codeNumber + 1);
> + length = cidx;
> + }
> + X265_CHECK((codeNumber != 0) || (length == 0), "length
> check failure\n");
> +
> + codeNumber = (length + length);
> + }
> + sum += (COEF_REMAIN_BIN_REDUCTION + 1 + goRiceParam +
> codeNumber);
> +
> + if (absCoeff[idx] > (COEF_REMAIN_BIN_REDUCTION <<
> goRiceParam))
> + goRiceParam = (goRiceParam + 1) - (goRiceParam >> 2);
> + X265_CHECK(goRiceParam <= 4, "goRiceParam check failure\n");
> + }
> + if (absCoeff[idx] >= 2)
> + firstCoeff2 = 0;
> + idx++;
> + }
> + while(idx < numNonZero);
> +
> + return sum;
> +}
> +#endif // debug only code
> +
> void Entropy::codeCoeffNxN(const CUData& cu, const coeff_t* coeff,
> uint32_t absPartIdx, uint32_t log2TrSize, TextType ttype)
> {
> uint32_t trSize = 1 << log2TrSize;
> @@ -1519,7 +1568,7 @@
> uint8_t * const baseCtx = bIsLuma ? &m_contextState[OFF_SIG_FLAG_CTX]
> : &m_contextState[OFF_SIG_FLAG_CTX + NUM_SIG_FLAG_CTX_LUMA];
> uint32_t c1 = 1;
> int scanPosSigOff = scanPosLast - (lastScanSet << MLS_CG_SIZE) - 1;
> - ALIGN_VAR_32(uint16_t, absCoeff[1 << MLS_CG_SIZE]);
> + ALIGN_VAR_32(uint16_t, absCoeff[(1 << MLS_CG_SIZE)]);
> uint32_t numNonZero = 1;
> unsigned long lastNZPosInCG;
> unsigned long firstNZPosInCG;
> @@ -1700,6 +1749,7 @@
> uint32_t numC1Flag = X265_MIN(numNonZero, C1FLAG_NUMBER);
> X265_CHECK(numC1Flag > 0, "numC1Flag check failure\n");
>
> + uint32_t firstC2Idx = 8;
> uint32_t firstC2Flag = 2;
> uint32_t c1Next = 0xFFFFFFFE;
> if (!m_bitIf)
> @@ -1720,9 +1770,13 @@
>
> if (symbol1)
> c1Next = 0;
> +
> if (symbol1 + firstC2Flag == 3)
> firstC2Flag = symbol2;
>
> + if (symbol1 + firstC2Idx == 9)
> + firstC2Idx = idx;
> +
> c1 = (c1Next & 3);
> c1Next >>= 2;
> X265_CHECK(c1 <= 3, "c1 check failure\n");
> @@ -1749,9 +1803,10 @@
> //encodeBinsEP((coeffSigns >> hiddenShift), numNonZero -
> hiddenShift);
> m_fracBits += (numNonZero - hiddenShift) << 15;
>
> - if (!c1 || numNonZero > C1FLAG_NUMBER)
> + if (numNonZero > firstC2Idx)
> {
> - uint32_t sum = primitives.costCoeffRemain(absCoeff,
> numNonZero);
> + sum = primitives.costCoeffRemain(absCoeff,
> numNonZero, firstC2Idx);
> + X265_CHECK(sum == costCoeffRemain_c0(absCoeff,
> numNonZero), "costCoeffRemain check failure\n");
> m_fracBits += ((uint64_t)sum << 15);
> }
> }
> @@ -1771,6 +1826,9 @@
> if (symbol1 + firstC2Flag == 3)
> firstC2Flag = symbol2;
>
> + if (symbol1 + firstC2Idx == 9)
> + firstC2Idx = idx;
> +
> c1 = (c1Next & 3);
> c1Next >>= 2;
> X265_CHECK(c1 <= 3, "c1 check failure\n");
> @@ -1793,15 +1851,17 @@
> {
> // Standard path
> uint32_t goRiceParam = 0;
> + int baseLevel = 3;
> +#if CHECKED_BUILD || _DEBUG
> int firstCoeff2 = 1;
> - uint32_t baseLevelN = 0x5555AAAA; // 2-bits encode
> format baseLevel
> -
> - idx = 0;
> +#endif
> + idx = firstC2Idx;
> do
> {
> - int baseLevel = (baseLevelN & 3) | firstCoeff2;
> + if (idx >= C1FLAG_NUMBER)
> + baseLevel = 1;
> + // TODO: fast algorithm maybe broken this check
> logic
> X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ?
> (2 + firstCoeff2) : 1), "baseLevel check failurr\n");
> - baseLevelN >>= 2;
>
> if (absCoeff[idx] >= baseLevel)
> {
> @@ -1810,8 +1870,10 @@
> goRiceParam = (goRiceParam + 1) -
> (goRiceParam >> 2);
> X265_CHECK(goRiceParam <= 4, "goRiceParam
> check failure\n");
> }
> - if (absCoeff[idx] >= 2)
> - firstCoeff2 = 0;
> +#if CHECKED_BUILD || _DEBUG
> + firstCoeff2 = 0;
> +#endif
> + baseLevel = 2;
> idx++;
> }
> while(idx < numNonZero);
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150610/11e0941d/attachment-0001.html>
More information about the x265-devel mailing list