[x265] [PATCH] Fix for C code mismatch
Steve Borho
steve at borho.org
Thu Nov 20 17:44:43 CET 2014
On 11/20, praveen at multicorewareinc.com wrote:
> # HG changeset patch
> # User Praveen Tiwari
> # Date 1416494042 -19800
> # Node ID b2b6021639875ebe7398d7725592206c1f33e613
> # Parent ed587d360b97624d8dc5b738d08029ffe4f8d92d
> Fix for C code mismatch
>
> This patch fixes the binary mismatch in encoded output introduced during
> refactoring of the transform/quant path. Basically it is the original version of
> the code, ensuring all valid inputs are copied into the input buffer. On the
> other hand, it is not fully optimized code, but this patch is a quick fix for the
> problem and allows us to optimize one function at a time.
queued with typos fixed
> diff -r ed587d360b97 -r b2b602163987 source/common/dct.cpp
> --- a/source/common/dct.cpp Thu Nov 20 18:25:09 2014 +0900
> +++ b/source/common/dct.cpp Thu Nov 20 20:04:02 2014 +0530
> @@ -454,51 +454,129 @@
> }
>
> fastForwardDst(block, coef, shift_1st);
> - fastForwardDst(coef, dst, shift_2nd);
> + fastForwardDst(coef, block, shift_2nd);
> +
> +#define N (4)
> + for (int i = 0; i < N; i++)
> + {
> + for (int j = 0; j < N; j++)
> + {
> + dst[i * N + j] = block[i * N + j];
> + }
> + }
> +
> +#undef N
> }
>
> -void dct4_c(const int16_t *src, int16_t *dst, intptr_t /* stride */)
> +void dct4_c(const int16_t *src, int16_t *dst, intptr_t stride)
> {
> const int shift_1st = 1 + X265_DEPTH - 8;
> const int shift_2nd = 8;
>
> ALIGN_VAR_32(int16_t, coef[4 * 4]);
> + ALIGN_VAR_32(int16_t, block[4 * 4]);
>
> - partialButterfly4(src, coef, shift_1st, 4);
> - partialButterfly4(coef, dst, shift_2nd, 4);
> + for (int i = 0; i < 4; i++)
> + {
> + memcpy(&block[i * 4], &src[i * stride], 4 * sizeof(int16_t));
> + }
> +
> + partialButterfly4(block, coef, shift_1st, 4);
> + partialButterfly4(coef, block, shift_2nd, 4);
> +#define N (4)
> + for (int i = 0; i < N; i++)
> + {
> + for (int j = 0; j < N; j++)
> + {
> + dst[i * N + j] = block[i * N + j];
> + }
> + }
> +
> +#undef N
> }
>
> -void dct8_c(const int16_t *src, int16_t *dst, intptr_t /* stride */)
> +void dct8_c(const int16_t *src, int16_t *dst, intptr_t stride)
> {
> const int shift_1st = 2 + X265_DEPTH - 8;
> const int shift_2nd = 9;
>
> ALIGN_VAR_32(int16_t, coef[8 * 8]);
> + ALIGN_VAR_32(int16_t, block[8 * 8]);
>
> - partialButterfly8(src, coef, shift_1st, 8);
> - partialButterfly8(coef, dst, shift_2nd, 8);
> + for (int i = 0; i < 8; i++)
> + {
> + memcpy(&block[i * 8], &src[i * stride], 8 * sizeof(int16_t));
> + }
> +
> + partialButterfly8(block, coef, shift_1st, 8);
> + partialButterfly8(coef, block, shift_2nd, 8);
> +
> +#define N (8)
> + for (int i = 0; i < N; i++)
> + {
> + for (int j = 0; j < N; j++)
> + {
> + dst[i * N + j] = block[i * N + j];
> + }
> + }
> +
> +#undef N
> }
>
> -void dct16_c(const int16_t *src, int16_t *dst, intptr_t /* stride */)
> +void dct16_c(const int16_t *src, int16_t *dst, intptr_t stride)
> {
> const int shift_1st = 3 + X265_DEPTH - 8;
> const int shift_2nd = 10;
>
> ALIGN_VAR_32(int16_t, coef[16 * 16]);
> + ALIGN_VAR_32(int16_t, block[16 * 16]);
>
> - partialButterfly16(src, coef, shift_1st, 16);
> - partialButterfly16(coef, dst, shift_2nd, 16);
> + for (int i = 0; i < 16; i++)
> + {
> + memcpy(&block[i * 16], &src[i * stride], 16 * sizeof(int16_t));
> + }
> +
> + partialButterfly16(block, coef, shift_1st, 16);
> + partialButterfly16(coef, block, shift_2nd, 16);
> +
> +#define N (16)
> + for (int i = 0; i < N; i++)
> + {
> + for (int j = 0; j < N; j++)
> + {
> + dst[i * N + j] = block[i * N + j];
> + }
> + }
> +
> +#undef N
> }
>
> -void dct32_c(const int16_t *src, int16_t *dst, intptr_t /* stride */)
> +void dct32_c(const int16_t *src, int16_t *dst, intptr_t stride)
> {
> const int shift_1st = 4 + X265_DEPTH - 8;
> const int shift_2nd = 11;
>
> ALIGN_VAR_32(int16_t, coef[32 * 32]);
> + ALIGN_VAR_32(int16_t, block[32 * 32]);
>
> - partialButterfly32(src, coef, shift_1st, 32);
> - partialButterfly32(coef, dst, shift_2nd, 32);
> + for (int i = 0; i < 32; i++)
> + {
> + memcpy(&block[i * 32], &src[i * stride], 32 * sizeof(int16_t));
> + }
> +
> + partialButterfly32(block, coef, shift_1st, 32);
> + partialButterfly32(coef, block, shift_2nd, 32);
> +
> +#define N (32)
> + for (int i = 0; i < N; i++)
> + {
> + for (int j = 0; j < N; j++)
> + {
> + dst[i * N + j] = block[i * N + j];
> + }
> + }
> +
> +#undef N
> }
>
> void idst4_c(const int16_t *src, int16_t *dst, intptr_t stride)
> @@ -509,7 +587,18 @@
> ALIGN_VAR_32(int16_t, coef[4 * 4]);
> ALIGN_VAR_32(int16_t, block[4 * 4]);
>
> - inversedst(src, coef, shift_1st); // Forward DST BY FAST ALGORITHM, block input, coef output
> +#define N (4)
> + for (int i = 0; i < N; i++)
> + {
> + for (int j = 0; j < N; j++)
> + {
> + block[i * N + j] = (int16_t)src[i * N + j];
> + }
> + }
> +
> +#undef N
> +
> + inversedst(block, coef, shift_1st); // Forward DST BY FAST ALGORITHM, block input, coef output
> inversedst(coef, block, shift_2nd); // Forward DST BY FAST ALGORITHM, coef input, coeff output
>
> for (int i = 0; i < 4; i++)
> @@ -526,7 +615,18 @@
> ALIGN_VAR_32(int16_t, coef[4 * 4]);
> ALIGN_VAR_32(int16_t, block[4 * 4]);
>
> - partialButterflyInverse4(src, coef, shift_1st, 4); // Forward DST BY FAST ALGORITHM, block input, coef output
> +#define N (4)
> + for (int i = 0; i < N; i++)
> + {
> + for (int j = 0; j < N; j++)
> + {
> + block[i * N + j] = (int16_t)src[i * N + j];
> + }
> + }
> +
> +#undef N
> +
> + partialButterflyInverse4(block, coef, shift_1st, 4); // Forward DST BY FAST ALGORITHM, block input, coef output
> partialButterflyInverse4(coef, block, shift_2nd, 4); // Forward DST BY FAST ALGORITHM, coef input, coeff output
>
> for (int i = 0; i < 4; i++)
> @@ -543,7 +643,18 @@
> ALIGN_VAR_32(int16_t, coef[8 * 8]);
> ALIGN_VAR_32(int16_t, block[8 * 8]);
>
> - partialButterflyInverse8(src, coef, shift_1st, 8);
> +#define N (8)
> + for (int i = 0; i < N; i++)
> + {
> + for (int j = 0; j < N; j++)
> + {
> + block[i * N + j] = (int16_t)src[i * N + j];
> + }
> + }
> +
> +#undef N
> +
> + partialButterflyInverse8(block, coef, shift_1st, 8);
> partialButterflyInverse8(coef, block, shift_2nd, 8);
> for (int i = 0; i < 8; i++)
> {
> @@ -559,7 +670,18 @@
> ALIGN_VAR_32(int16_t, coef[16 * 16]);
> ALIGN_VAR_32(int16_t, block[16 * 16]);
>
> - partialButterflyInverse16(src, coef, shift_1st, 16);
> +#define N (16)
> + for (int i = 0; i < N; i++)
> + {
> + for (int j = 0; j < N; j++)
> + {
> + block[i * N + j] = (int16_t)src[i * N + j];
> + }
> + }
> +
> +#undef N
> +
> + partialButterflyInverse16(block, coef, shift_1st, 16);
> partialButterflyInverse16(coef, block, shift_2nd, 16);
> for (int i = 0; i < 16; i++)
> {
> @@ -575,7 +697,18 @@
> ALIGN_VAR_32(int16_t, coef[32 * 32]);
> ALIGN_VAR_32(int16_t, block[32 * 32]);
>
> - partialButterflyInverse32(src, coef, shift_1st, 32);
> +#define N (32)
> + for (int i = 0; i < N; i++)
> + {
> + for (int j = 0; j < N; j++)
> + {
> + block[i * N + j] = (int16_t)src[i * N + j];
> + }
> + }
> +
> +#undef N
> +
> + partialButterflyInverse32(block, coef, shift_1st, 32);
> partialButterflyInverse32(coef, block, shift_2nd, 32);
>
> for (int i = 0; i < 32; i++)
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
--
Steve Borho
More information about the x265-devel
mailing list