[x265] [PATCH] refactorization of the transform/quant path

Tue Nov 18 18:55:15 CET 2014

On 11/18, praveen at multicorewareinc.com wrote:
> # HG changeset patch
> # User Praveen Tiwari
> # Date 1416299427 -19800
> # Node ID 706fa4af912bc1610478de8f09a651ae3e58624c
> # Parent  2f0062f0791b822fa932712a56e6b0a14e976d91
> refactorization of the transform/quant path.

Queued with white-space reflowing of the commit message

> This patch involves scaling down the DCT/IDCT coefficients from int32_t to int16_t,
> as they can be accommodated in int16_t without introducing any encode error.
> This allows us to clean up lots of DCT/IDCT intermediate buffers and optimize encode efficiency for different
> CLI options, including noise reduction, by reducing data movement operations and accommodating a larger number of
> coefficients in a single register for SIMD operations. This patch includes all necessary
> changes for the transform/quant path, including unit test code.
> 
> diff -r 2f0062f0791b -r 706fa4af912b source/common/dct.cpp
> --- a/source/common/dct.cpp	Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/common/dct.cpp	Tue Nov 18 14:00:27 2014 +0530
> @@ -440,7 +440,7 @@
>      }
>  }
>  
> -void dst4_c(int16_t *src, int32_t *dst, intptr_t stride)
> +void dst4_c(int16_t *src, int16_t *dst, intptr_t stride)
>  {
>      const int shift_1st = 1 + X265_DEPTH - 8;
>      const int shift_2nd = 8;
> @@ -454,132 +454,58 @@
>      }
>  
>      fastForwardDst(block, coef, shift_1st);
> -    fastForwardDst(coef, block, shift_2nd);
> -
> -#define N (4)
> -    for (int i = 0; i < N; i++)
> -    {
> -        for (int j = 0; j < N; j++)
> -        {
> -            dst[i * N + j] = block[i * N + j];
> -        }
> -    }
> -
> -#undef N
> +    fastForwardDst(coef, dst, shift_2nd);
>  }
>  
> -void dct4_c(int16_t *src, int32_t *dst, intptr_t stride)
> +void dct4_c(int16_t *src, int16_t *dst, intptr_t stride)
>  {
> +    stride; // To eliminate warnings and match the interface with asm code.
>      const int shift_1st = 1 + X265_DEPTH - 8;
>      const int shift_2nd = 8;
>  
>      ALIGN_VAR_32(int16_t, coef[4 * 4]);
> -    ALIGN_VAR_32(int16_t, block[4 * 4]);
>  
> -    for (int i = 0; i < 4; i++)
> -    {
> -        memcpy(&block[i * 4], &src[i * stride], 4 * sizeof(int16_t));
> -    }
> -
> -    partialButterfly4(block, coef, shift_1st, 4);
> -    partialButterfly4(coef, block, shift_2nd, 4);
> -#define N (4)
> -    for (int i = 0; i < N; i++)
> -    {
> -        for (int j = 0; j < N; j++)
> -        {
> -            dst[i * N + j] = block[i * N + j];
> -        }
> -    }
> -
> -#undef N
> +    partialButterfly4(src, coef, shift_1st, 4);
> +    partialButterfly4(coef, dst, shift_2nd, 4);
>  }
>  
> -void dct8_c(int16_t *src, int32_t *dst, intptr_t stride)
> +void dct8_c(int16_t *src, int16_t *dst, intptr_t stride)
>  {
> +    stride; // To eliminate warnings and match the interface with asm code.
>      const int shift_1st = 2 + X265_DEPTH - 8;
>      const int shift_2nd = 9;
>  
>      ALIGN_VAR_32(int16_t, coef[8 * 8]);
> -    ALIGN_VAR_32(int16_t, block[8 * 8]);
>  
> -    for (int i = 0; i < 8; i++)
> -    {
> -        memcpy(&block[i * 8], &src[i * stride], 8 * sizeof(int16_t));
> -    }
> -
> -    partialButterfly8(block, coef, shift_1st, 8);
> -    partialButterfly8(coef, block, shift_2nd, 8);
> -
> -#define N (8)
> -    for (int i = 0; i < N; i++)
> -    {
> -        for (int j = 0; j < N; j++)
> -        {
> -            dst[i * N + j] = block[i * N + j];
> -        }
> -    }
> -
> -#undef N
> +    partialButterfly8(src, coef, shift_1st, 8);
> +    partialButterfly8(coef, dst, shift_2nd, 8);
>  }
>  
> -void dct16_c(int16_t *src, int32_t *dst, intptr_t stride)
> +void dct16_c(int16_t *src, int16_t *dst, intptr_t stride)
>  {
> +    stride; // To eliminate warnings and match the interface with asm code.
>      const int shift_1st = 3 + X265_DEPTH - 8;
>      const int shift_2nd = 10;
>  
>      ALIGN_VAR_32(int16_t, coef[16 * 16]);
> -    ALIGN_VAR_32(int16_t, block[16 * 16]);
>  
> -    for (int i = 0; i < 16; i++)
> -    {
> -        memcpy(&block[i * 16], &src[i * stride], 16 * sizeof(int16_t));
> -    }
> -
> -    partialButterfly16(block, coef, shift_1st, 16);
> -    partialButterfly16(coef, block, shift_2nd, 16);
> -
> -#define N (16)
> -    for (int i = 0; i < N; i++)
> -    {
> -        for (int j = 0; j < N; j++)
> -        {
> -            dst[i * N + j] = block[i * N + j];
> -        }
> -    }
> -
> -#undef N
> +    partialButterfly16(src, coef, shift_1st, 16);
> +    partialButterfly16(coef, dst, shift_2nd, 16);
>  }
>  
> -void dct32_c(int16_t *src, int32_t *dst, intptr_t stride)
> +void dct32_c(int16_t *src, int16_t *dst, intptr_t stride)
>  {
> +    stride; // To eliminate warnings and match the interface with asm code.
>      const int shift_1st = 4 + X265_DEPTH - 8;
>      const int shift_2nd = 11;
>  
>      ALIGN_VAR_32(int16_t, coef[32 * 32]);
> -    ALIGN_VAR_32(int16_t, block[32 * 32]);
>  
> -    for (int i = 0; i < 32; i++)
> -    {
> -        memcpy(&block[i * 32], &src[i * stride], 32 * sizeof(int16_t));
> -    }
> -
> -    partialButterfly32(block, coef, shift_1st, 32);
> -    partialButterfly32(coef, block, shift_2nd, 32);
> -
> -#define N (32)
> -    for (int i = 0; i < N; i++)
> -    {
> -        for (int j = 0; j < N; j++)
> -        {
> -            dst[i * N + j] = block[i * N + j];
> -        }
> -    }
> -
> -#undef N
> +    partialButterfly32(src, coef, shift_1st, 32);
> +    partialButterfly32(coef, dst, shift_2nd, 32);
>  }
>  
> -void idst4_c(int32_t *src, int16_t *dst, intptr_t stride)
> +void idst4_c(int16_t *src, int16_t *dst, intptr_t stride)
>  {
>      const int shift_1st = 7;
>      const int shift_2nd = 12 - (X265_DEPTH - 8);
> @@ -587,18 +513,7 @@
>      ALIGN_VAR_32(int16_t, coef[4 * 4]);
>      ALIGN_VAR_32(int16_t, block[4 * 4]);
>  
> -#define N (4)
> -    for (int i = 0; i < N; i++)
> -    {
> -        for (int j = 0; j < N; j++)
> -        {
> -            block[i * N + j] = (int16_t)src[i * N + j];
> -        }
> -    }
> -
> -#undef N
> -
> -    inversedst(block, coef, shift_1st); // Forward DST BY FAST ALGORITHM, block input, coef output
> +    inversedst(src, coef, shift_1st); // Forward DST BY FAST ALGORITHM, block input, coef output
>      inversedst(coef, block, shift_2nd); // Forward DST BY FAST ALGORITHM, coef input, coeff output
>  
>      for (int i = 0; i < 4; i++)
> @@ -607,7 +522,7 @@
>      }
>  }
>  
> -void idct4_c(int32_t *src, int16_t *dst, intptr_t stride)
> +void idct4_c(int16_t *src, int16_t *dst, intptr_t stride)
>  {
>      const int shift_1st = 7;
>      const int shift_2nd = 12 - (X265_DEPTH - 8);
> @@ -615,18 +530,7 @@
>      ALIGN_VAR_32(int16_t, coef[4 * 4]);
>      ALIGN_VAR_32(int16_t, block[4 * 4]);
>  
> -#define N (4)
> -    for (int i = 0; i < N; i++)
> -    {
> -        for (int j = 0; j < N; j++)
> -        {
> -            block[i * N + j] = (int16_t)src[i * N + j];
> -        }
> -    }
> -
> -#undef N
> -
> -    partialButterflyInverse4(block, coef, shift_1st, 4); // Forward DST BY FAST ALGORITHM, block input, coef output
> +    partialButterflyInverse4(src, coef, shift_1st, 4); // Forward DST BY FAST ALGORITHM, block input, coef output
>      partialButterflyInverse4(coef, block, shift_2nd, 4); // Forward DST BY FAST ALGORITHM, coef input, coeff output
>  
>      for (int i = 0; i < 4; i++)
> @@ -635,7 +539,7 @@
>      }
>  }
>  
> -void idct8_c(int32_t *src, int16_t *dst, intptr_t stride)
> +void idct8_c(int16_t *src, int16_t *dst, intptr_t stride)
>  {
>      const int shift_1st = 7;
>      const int shift_2nd = 12 - (X265_DEPTH - 8);
> @@ -643,18 +547,7 @@
>      ALIGN_VAR_32(int16_t, coef[8 * 8]);
>      ALIGN_VAR_32(int16_t, block[8 * 8]);
>  
> -#define N (8)
> -    for (int i = 0; i < N; i++)
> -    {
> -        for (int j = 0; j < N; j++)
> -        {
> -            block[i * N + j] = (int16_t)src[i * N + j];
> -        }
> -    }
> -
> -#undef N
> -
> -    partialButterflyInverse8(block, coef, shift_1st, 8);
> +    partialButterflyInverse8(src, coef, shift_1st, 8);
>      partialButterflyInverse8(coef, block, shift_2nd, 8);
>      for (int i = 0; i < 8; i++)
>      {
> @@ -662,7 +555,7 @@
>      }
>  }
>  
> -void idct16_c(int32_t *src, int16_t *dst, intptr_t stride)
> +void idct16_c(int16_t *src, int16_t *dst, intptr_t stride)
>  {
>      const int shift_1st = 7;
>      const int shift_2nd = 12 - (X265_DEPTH - 8);
> @@ -670,18 +563,7 @@
>      ALIGN_VAR_32(int16_t, coef[16 * 16]);
>      ALIGN_VAR_32(int16_t, block[16 * 16]);
>  
> -#define N (16)
> -    for (int i = 0; i < N; i++)
> -    {
> -        for (int j = 0; j < N; j++)
> -        {
> -            block[i * N + j] = (int16_t)src[i * N + j];
> -        }
> -    }
> -
> -#undef N
> -
> -    partialButterflyInverse16(block, coef, shift_1st, 16);
> +    partialButterflyInverse16(src, coef, shift_1st, 16);
>      partialButterflyInverse16(coef, block, shift_2nd, 16);
>      for (int i = 0; i < 16; i++)
>      {
> @@ -689,7 +571,7 @@
>      }
>  }
>  
> -void idct32_c(int32_t *src, int16_t *dst, intptr_t stride)
> +void idct32_c(int16_t *src, int16_t *dst, intptr_t stride)
>  {
>      const int shift_1st = 7;
>      const int shift_2nd = 12 - (X265_DEPTH - 8);
> @@ -697,18 +579,7 @@
>      ALIGN_VAR_32(int16_t, coef[32 * 32]);
>      ALIGN_VAR_32(int16_t, block[32 * 32]);
>  
> -#define N (32)
> -    for (int i = 0; i < N; i++)
> -    {
> -        for (int j = 0; j < N; j++)
> -        {
> -            block[i * N + j] = (int16_t)src[i * N + j];
> -        }
> -    }
> -
> -#undef N
> -
> -    partialButterflyInverse32(block, coef, shift_1st, 32);
> +    partialButterflyInverse32(src, coef, shift_1st, 32);
>      partialButterflyInverse32(coef, block, shift_2nd, 32);
>  
>      for (int i = 0; i < 32; i++)
> @@ -717,7 +588,7 @@
>      }
>  }
>  
> -void dequant_normal_c(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift)
> +void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
>  {
>  #if HIGH_BIT_DEPTH
>      X265_CHECK(scale < 32768 || ((scale & 3) == 0 && shift > 2), "dequant invalid scale %d\n", scale);
> @@ -737,11 +608,11 @@
>      for (int n = 0; n < num; n++)
>      {
>          coeffQ = (quantCoef[n] * scale + add) >> shift;
> -        coef[n] = Clip3(-32768, 32767, coeffQ);
> +        coef[n] = (int16_t)Clip3(-32768, 32767, coeffQ);
>      }
>  }
>  
> -void dequant_scaling_c(const int16_t* quantCoef, const int32_t *deQuantCoef, int32_t* coef, int num, int per, int shift)
> +void dequant_scaling_c(const int16_t* quantCoef, const int32_t *deQuantCoef, int16_t* coef, int num, int per, int shift)
>  {
>      X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);
>  
> @@ -756,7 +627,7 @@
>          for (int n = 0; n < num; n++)
>          {
>              coeffQ = ((quantCoef[n] * deQuantCoef[n]) + add) >> (shift - per);
> -            coef[n] = Clip3(-32768, 32767, coeffQ);
> +            coef[n] = (int16_t)Clip3(-32768, 32767, coeffQ);
>          }
>      }
>      else
> @@ -764,12 +635,12 @@
>          for (int n = 0; n < num; n++)
>          {
>              coeffQ   = Clip3(-32768, 32767, quantCoef[n] * deQuantCoef[n]);
> -            coef[n] = Clip3(-32768, 32767, coeffQ << (per - shift));
> +            coef[n] = (int16_t)Clip3(-32768, 32767, coeffQ << (per - shift));
>          }
>      }
>  }
>  
> -uint32_t quant_c(int32_t* coef, int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
> +uint32_t quant_c(int16_t* coef, int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
>  {
>      X265_CHECK(qBits >= 8, "qBits less than 8\n");
>      X265_CHECK((numCoeff % 16) == 0, "numCoeff must be multiple of 16\n");
> @@ -793,7 +664,7 @@
>      return numSig;
>  }
>  
> -uint32_t nquant_c(int32_t* coef, int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff)
> +uint32_t nquant_c(int16_t* coef, int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff)
>  {
>      X265_CHECK((numCoeff % 16) == 0, "number of quant coeff is not multiple of 4x4\n");
>      X265_CHECK((uint32_t)add < ((uint32_t)1 << qBits), "2 ^ qBits less than add\n");
> @@ -848,7 +719,7 @@
>      return numSig;
>  }
>  
> -void denoiseDct_c(int32_t* dctCoef, uint32_t* resSum, uint16_t* offset, int numCoeff)
> +void denoiseDct_c(int16_t* dctCoef, uint32_t* resSum, uint16_t* offset, int numCoeff)
>  {
>      for (int i = 0; i < numCoeff; i++)
>      {
> @@ -857,7 +728,7 @@
>          level = (level + sign) ^ sign;
>          resSum[i] += level;
>          level -= offset[i];
> -        dctCoef[i] = level < 0 ? 0 : (level ^ sign) - sign;
> +        dctCoef[i] = (int16_t)(level < 0 ? 0 : (level ^ sign) - sign);
>      }
>  }
>  
> diff -r 2f0062f0791b -r 706fa4af912b source/common/pixel.cpp
> --- a/source/common/pixel.cpp	Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/common/pixel.cpp	Tue Nov 18 14:00:27 2014 +0530
> @@ -491,13 +491,13 @@
>      }
>  }
>  
> -void convert16to32_shl(int32_t *dst, int16_t *src, intptr_t stride, int shift, int size)
> +void copy16to16_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size)
>  {
>      for (int i = 0; i < size; i++)
>      {
>          for (int j = 0; j < size; j++)
>          {
> -            dst[i * size + j] = ((int)src[i * stride + j]) << shift;
> +            dst[i * size + j] = (src[i * stride + j]) << shift;
>          }
>      }
>  }
> @@ -514,22 +514,6 @@
>      }
>  }
>  
> -void convert32to16_shr(int16_t *dst, int32_t *src, intptr_t stride, int shift, int size)
> -{
> -    int round = 1 << (shift - 1);
> -
> -    for (int i = 0; i < size; i++)
> -    {
> -        for (int j = 0; j < size; j++)
> -        {
> -            dst[j] = (int16_t)((src[j] + round) >> shift);
> -        }
> -
> -        src += size;
> -        dst += stride;
> -    }
> -}
> -
>  void copy_shr(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size)
>  {
>      int round = 1 << (shift - 1);
> @@ -1288,12 +1272,11 @@
>      p.blockfill_s[BLOCK_32x32] = blockfil_s_c<32>;
>      p.blockfill_s[BLOCK_64x64] = blockfil_s_c<64>;
>  
> -    p.cvt16to32_shl = convert16to32_shl;
> +    p.cpy16to16_shl = copy16to16_shl;
>      p.cvt16to32_shr[BLOCK_4x4] = convert16to32_shr<4>;
>      p.cvt16to32_shr[BLOCK_8x8] = convert16to32_shr<8>;
>      p.cvt16to32_shr[BLOCK_16x16] = convert16to32_shr<16>;
>      p.cvt16to32_shr[BLOCK_32x32] = convert16to32_shr<32>;
> -    p.cvt32to16_shr = convert32to16_shr;
>      p.cvt32to16_shl[BLOCK_4x4] = convert32to16_shl<4>;
>      p.cvt32to16_shl[BLOCK_8x8] = convert32to16_shl<8>;
>      p.cvt32to16_shl[BLOCK_16x16] = convert32to16_shl<16>;
> diff -r 2f0062f0791b -r 706fa4af912b source/common/primitives.h
> --- a/source/common/primitives.h	Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/common/primitives.h	Tue Nov 18 14:00:27 2014 +0530
> @@ -147,24 +147,23 @@
>  typedef void (*intra_pred_t)(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter);
>  typedef void (*intra_allangs_t)(pixel *dst, pixel *above0, pixel *left0, pixel *above1, pixel *left1, int bLuma);
>  
> -typedef void (*cvt16to32_shl_t)(int32_t *dst, int16_t *src, intptr_t, int, int);
> +typedef void (*cpy16to16_shl_t)(int16_t *dst, int16_t *src, intptr_t, int, int);
>  typedef void (*cvt16to32_shr_t)(int32_t *dst, int16_t *src, intptr_t, int, int);
> -typedef void (*cvt32to16_shr_t)(int16_t *dst, int32_t *src, intptr_t, int, int);
>  typedef void (*cvt32to16_shl_t)(int16_t *dst, int32_t *src, intptr_t, int);
>  typedef uint32_t (*copy_cnt_t)(int16_t* coeff, int16_t* residual, intptr_t stride);
>  typedef void (*copy_shr_t)(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size);
>  typedef void (*copy_shl_t)(int16_t *dst, int16_t *src, intptr_t stride, int shift);
>  
> -typedef void (*dct_t)(int16_t *src, int32_t *dst, intptr_t stride);
> -typedef void (*idct_t)(int32_t *src, int16_t *dst, intptr_t stride);
> -typedef void (*denoiseDct_t)(int32_t* dctCoef, uint32_t* resSum, uint16_t* offset, int numCoeff);
> +typedef void (*dct_t)(int16_t *src, int16_t *dst, intptr_t stride);
> +typedef void (*idct_t)(int16_t *src, int16_t *dst, intptr_t stride);
> +typedef void (*denoiseDct_t)(int16_t* dctCoef, uint32_t* resSum, uint16_t* offset, int numCoeff);
>  
>  typedef void (*calcresidual_t)(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
>  typedef void (*transpose_t)(pixel* dst, pixel* src, intptr_t stride);
> -typedef uint32_t (*quant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
> -typedef uint32_t (*nquant_t)(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
> -typedef void (*dequant_scaling_t)(const int16_t* src, const int32_t *dequantCoef, int32_t* dst, int num, int mcqp_miper, int shift);
> -typedef void (*dequant_normal_t)(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift);
> +typedef uint32_t (*quant_t)(int16_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
> +typedef uint32_t (*nquant_t)(int16_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
> +typedef void (*dequant_scaling_t)(const int16_t* src, const int32_t *dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift);
> +typedef void (*dequant_normal_t)(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
>  typedef int  (*count_nonzero_t)(const int16_t *quantCoeff, int numCoeff);
>  
>  typedef void (*weightp_pp_t)(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
> @@ -220,9 +219,8 @@
>      pixelcmp_ss_t   psy_cost_ss[NUM_SQUARE_BLOCKS];
>  
>      blockfill_s_t   blockfill_s[NUM_SQUARE_BLOCKS];  // block fill with value
> -    cvt16to32_shl_t cvt16to32_shl;
> +    cpy16to16_shl_t cpy16to16_shl;
>      cvt16to32_shr_t cvt16to32_shr[NUM_SQUARE_BLOCKS - 1];
> -    cvt32to16_shr_t cvt32to16_shr;
>      cvt32to16_shl_t cvt32to16_shl[NUM_SQUARE_BLOCKS - 1];
>      copy_cnt_t      copy_cnt[NUM_SQUARE_BLOCKS - 1];
>      copy_shr_t      copy_shr;
> diff -r 2f0062f0791b -r 706fa4af912b source/common/quant.cpp
> --- a/source/common/quant.cpp	Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/common/quant.cpp	Tue Nov 18 14:00:27 2014 +0530
> @@ -166,7 +166,7 @@
>      m_useRDOQ = useRDOQ;
>      m_psyRdoqScale = (int64_t)(psyScale * 256.0);
>      m_scalingList = &scalingList;
> -    m_resiDctCoeff = X265_MALLOC(int32_t, MAX_TR_SIZE * MAX_TR_SIZE * 2);
> +    m_resiDctCoeff = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE * 2);
>      m_fencDctCoeff = m_resiDctCoeff + (MAX_TR_SIZE * MAX_TR_SIZE);
>      m_fencShortBuf = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE);
>  
> @@ -340,7 +340,7 @@
>      if (useTransformSkip)
>      {
>  #if X265_DEPTH <= 10
> -        primitives.cvt16to32_shl(m_resiDctCoeff, residual, stride, transformShift, trSize);
> +        primitives.cpy16to16_shl(m_resiDctCoeff, residual, stride, transformShift, trSize);
>  #else
>          if (transformShift >= 0)
>              primitives.cvt16to32_shl(m_resiDctCoeff, residual, stride, transformShift, trSize);
> @@ -441,10 +441,10 @@
>          int trSize = 1 << log2TrSize;
>  
>  #if X265_DEPTH <= 10
> -        primitives.cvt32to16_shr(residual, m_resiDctCoeff, stride, transformShift, trSize);
> +        primitives.copy_shr(residual, m_resiDctCoeff, stride, transformShift, trSize);
>  #else
>          if (transformShift > 0)
> -            primitives.cvt32to16_shr(residual, m_resiDctCoeff, stride, transformShift, trSize);
> +            primitives.copy_shr(residual, m_resiDctCoeff, stride, transformShift, trSize);
>          else
>              primitives.cvt32to16_shl[log2TrSize - 2](residual, m_resiDctCoeff, stride, -transformShift);
>  #endif
> diff -r 2f0062f0791b -r 706fa4af912b source/common/quant.h
> --- a/source/common/quant.h	Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/common/quant.h	Tue Nov 18 14:00:27 2014 +0530
> @@ -83,8 +83,8 @@
>  
>      bool               m_useRDOQ;
>      int64_t            m_psyRdoqScale;
> -    int32_t*           m_resiDctCoeff;
> -    int32_t*           m_fencDctCoeff;
> +    int16_t*           m_resiDctCoeff;
> +    int16_t*           m_fencDctCoeff;
>      int16_t*           m_fencShortBuf;
>  
>      enum { IEP_RATE = 32768 }; /* FIX15 cost of an equal probable bit */
> diff -r 2f0062f0791b -r 706fa4af912b source/common/vec/dct-sse3.cpp
> --- a/source/common/vec/dct-sse3.cpp	Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/common/vec/dct-sse3.cpp	Tue Nov 18 14:00:27 2014 +0530
> @@ -52,7 +52,7 @@
>      {  83,  36,  83,  36, 83,  36, 83,  36 },
>      {  36, -83,  36, -83, 36, -83, 36, -83 }
>  };
> -void idct8(int32_t *src, int16_t *dst, intptr_t stride)
> +void idct8(int16_t *src, int16_t *dst, intptr_t stride)
>  {
>      __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h, E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l, O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h;
>      __m128i T00, T01, T02, T03, T04, T05, T06, T07;
> @@ -305,7 +305,7 @@
>      _mm_storeh_pi((__m64*)&dst[7 * stride +  4], _mm_castsi128_ps(T11));
>  }
>  
> -void idct16(int32_t *src, int16_t *dst, intptr_t stride)
> +void idct16(int16_t *src, int16_t *dst, intptr_t stride)
>  {
>      const __m128i c16_p87_p90   = _mm_set1_epi32(0x0057005A); //row0 87high - 90low address
>      const __m128i c16_p70_p80   = _mm_set1_epi32(0x00460050);
> @@ -367,71 +367,22 @@
>      for (int i = 0; i < 2; i++)
>      {
>          const int offset = (i << 3);
> -        __m128i T00, T01;
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[0 * 16 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[0 * 16 + offset + 4]);
> -        in00[i]  = _mm_packs_epi32(T00, T01);                       // [07 06 05 04 03 02 01 00]
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[1 * 16 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[1 * 16 + offset + 4]);
> -        in01[i]  = _mm_packs_epi32(T00, T01);                           // [17 16 15 14 13 12 11 10]
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[2 * 16 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[2 * 16 + offset + 4]);
> -        in02[i]  = _mm_packs_epi32(T00, T01);                       // [27 26 25 24 23 22 21 20]
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[3 * 16 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[3 * 16 + offset + 4]);
> -        in03[i]  = _mm_packs_epi32(T00, T01);                       // [37 36 35 34 33 32 31 30]
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[4 * 16 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[4 * 16 + offset + 4]);
> -        in04[i]  = _mm_packs_epi32(T00, T01);                       // [47 46 45 44 43 42 41 40]
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[5 * 16 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[5 * 16 + offset + 4]);
> -        in05[i]  = _mm_packs_epi32(T00, T01);                       // [57 56 55 54 53 52 51 50]
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[6 * 16 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[6 * 16 + offset + 4]);
> -        in06[i]  = _mm_packs_epi32(T00, T01);                       // [67 66 65 64 63 62 61 60]
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[7 * 16 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[7 * 16 + offset + 4]);
> -        in07[i]  = _mm_packs_epi32(T00, T01);                       // [77 76 75 74 73 72 71 70]
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[8 * 16 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[8 * 16 + offset + 4]);
> -        in08[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[9 * 16 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[9 * 16 + offset + 4]);
> -        in09[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[10 * 16 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[10 * 16 + offset + 4]);
> -        in10[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[11 * 16 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[11 * 16 + offset + 4]);
> -        in11[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[12 * 16 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[12 * 16 + offset + 4]);
> -        in12[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[13 * 16 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[13 * 16 + offset + 4]);
> -        in13[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[14 * 16 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[14 * 16 + offset + 4]);
> -        in14[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[15 * 16 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[15 * 16 + offset + 4]);
> -        in15[i]  = _mm_packs_epi32(T00, T01);
> +        in00[i]  = _mm_loadu_si128((const __m128i*)&src[0 * 16 + offset]); // [07 06 05 04 03 02 01 00]
> +        in01[i]  = _mm_loadu_si128((const __m128i*)&src[1 * 16 + offset]); // [17 16 15 14 13 12 11 10]
> +        in02[i]  = _mm_loadu_si128((const __m128i*)&src[2 * 16 + offset]); // [27 26 25 24 23 22 21 20]
> +        in03[i]  = _mm_loadu_si128((const __m128i*)&src[3 * 16 + offset]); // [37 36 35 34 33 32 31 30]
> +        in04[i]  = _mm_loadu_si128((const __m128i*)&src[4 * 16 + offset]); // [47 46 45 44 43 42 41 40]
> +        in05[i]  = _mm_loadu_si128((const __m128i*)&src[5 * 16 + offset]); // [57 56 55 54 53 52 51 50]
> +        in06[i]  = _mm_loadu_si128((const __m128i*)&src[6 * 16 + offset]); // [67 66 65 64 63 62 61 60]
> +        in07[i]  = _mm_loadu_si128((const __m128i*)&src[7 * 16 + offset]); // [77 76 75 74 73 72 71 70]
> +        in08[i]  = _mm_loadu_si128((const __m128i*)&src[8 * 16 + offset]);
> +        in09[i]  = _mm_loadu_si128((const __m128i*)&src[9 * 16 + offset]);
> +        in10[i]  = _mm_loadu_si128((const __m128i*)&src[10 * 16 + offset]);
> +        in11[i]  = _mm_loadu_si128((const __m128i*)&src[11 * 16 + offset]);
> +        in12[i]  = _mm_loadu_si128((const __m128i*)&src[12 * 16 + offset]);
> +        in13[i]  = _mm_loadu_si128((const __m128i*)&src[13 * 16 + offset]);
> +        in14[i]  = _mm_loadu_si128((const __m128i*)&src[14 * 16 + offset]);
> +        in15[i]  = _mm_loadu_si128((const __m128i*)&src[15 * 16 + offset]);
>      }
>  
>      for (int pass = 0; pass < 2; pass++)
> @@ -716,7 +667,7 @@
>      _mm_store_si128((__m128i*)&dst[15 * stride + 8], in15[1]);
>  }
>  
> -void idct32(int32_t *src, int16_t *dst, intptr_t stride)
> +void idct32(int16_t *src, int16_t *dst, intptr_t stride)
>  {
>      //Odd
>      const __m128i c16_p90_p90   = _mm_set1_epi32(0x005A005A); //column 0
> @@ -909,135 +860,38 @@
>      for (int i = 0; i < 4; i++)
>      {
>          const int offset = (i << 3);
> -        __m128i T00, T01;
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[0 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[0 * 32 + offset + 4]);
> -        in00[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[1 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[1 * 32 + offset + 4]);
> -        in01[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[2 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[2 * 32 + offset + 4]);
> -        in02[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[3 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[3 * 32 + offset + 4]);
> -        in03[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[4 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[4 * 32 + offset + 4]);
> -        in04[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[5 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[5 * 32 + offset + 4]);
> -        in05[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[6 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[6 * 32 + offset + 4]);
> -        in06[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[7 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[7 * 32 + offset + 4]);
> -        in07[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[8 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[8 * 32 + offset + 4]);
> -        in08[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[9 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[9 * 32 + offset + 4]);
> -        in09[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[10 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[10 * 32 + offset + 4]);
> -        in10[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[11 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[11 * 32 + offset + 4]);
> -        in11[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[12 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[12 * 32 + offset + 4]);
> -        in12[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[13 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[13 * 32 + offset + 4]);
> -        in13[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[14 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[14 * 32 + offset + 4]);
> -        in14[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[15 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[15 * 32 + offset + 4]);
> -        in15[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[16 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[16 * 32 + offset + 4]);
> -        in16[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[17 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[17 * 32 + offset + 4]);
> -        in17[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[18 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[18 * 32 + offset + 4]);
> -        in18[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[19 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[19 * 32 + offset + 4]);
> -        in19[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[20 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[20 * 32 + offset + 4]);
> -        in20[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[21 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[21 * 32 + offset + 4]);
> -        in21[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[22 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[22 * 32 + offset + 4]);
> -        in22[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[23 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[23 * 32 + offset + 4]);
> -        in23[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[24 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[24 * 32 + offset + 4]);
> -        in24[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[25 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[25 * 32 + offset + 4]);
> -        in25[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[26 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[26 * 32 + offset + 4]);
> -        in26[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[27 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[27 * 32 + offset + 4]);
> -        in27[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[28 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[28 * 32 + offset + 4]);
> -        in28[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[29 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[29 * 32 + offset + 4]);
> -        in29[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[30 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[30 * 32 + offset + 4]);
> -        in30[i]  = _mm_packs_epi32(T00, T01);
> -
> -        T00 = _mm_loadu_si128((const __m128i*)&src[31 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&src[31 * 32 + offset + 4]);
> -        in31[i]  = _mm_packs_epi32(T00, T01);
> +        in00[i]  = _mm_loadu_si128((const __m128i*)&src[0  * 32 + offset]);
> +        in01[i]  = _mm_loadu_si128((const __m128i*)&src[1  * 32 + offset]);
> +        in02[i]  = _mm_loadu_si128((const __m128i*)&src[2  * 32 + offset]);
> +        in03[i]  = _mm_loadu_si128((const __m128i*)&src[3  * 32 + offset]);
> +        in04[i]  = _mm_loadu_si128((const __m128i*)&src[4  * 32 + offset]);
> +        in05[i]  = _mm_loadu_si128((const __m128i*)&src[5  * 32 + offset]);
> +        in06[i]  = _mm_loadu_si128((const __m128i*)&src[6  * 32 + offset]);
> +        in07[i]  = _mm_loadu_si128((const __m128i*)&src[7  * 32 + offset]);
> +        in08[i]  = _mm_loadu_si128((const __m128i*)&src[8  * 32 + offset]);
> +        in09[i]  = _mm_loadu_si128((const __m128i*)&src[9  * 32 + offset]);
> +        in10[i]  = _mm_loadu_si128((const __m128i*)&src[10 * 32 + offset]);
> +        in11[i]  = _mm_loadu_si128((const __m128i*)&src[11 * 32 + offset]);
> +        in12[i]  = _mm_loadu_si128((const __m128i*)&src[12 * 32 + offset]);
> +        in13[i]  = _mm_loadu_si128((const __m128i*)&src[13 * 32 + offset]);
> +        in14[i]  = _mm_loadu_si128((const __m128i*)&src[14 * 32 + offset]);
> +        in15[i]  = _mm_loadu_si128((const __m128i*)&src[15 * 32 + offset]);
> +        in16[i]  = _mm_loadu_si128((const __m128i*)&src[16 * 32 + offset]);
> +        in17[i]  = _mm_loadu_si128((const __m128i*)&src[17 * 32 + offset]);
> +        in18[i]  = _mm_loadu_si128((const __m128i*)&src[18 * 32 + offset]);
> +        in19[i]  = _mm_loadu_si128((const __m128i*)&src[19 * 32 + offset]);
> +        in20[i]  = _mm_loadu_si128((const __m128i*)&src[20 * 32 + offset]);
> +        in21[i]  = _mm_loadu_si128((const __m128i*)&src[21 * 32 + offset]);
> +        in22[i]  = _mm_loadu_si128((const __m128i*)&src[22 * 32 + offset]);
> +        in23[i]  = _mm_loadu_si128((const __m128i*)&src[23 * 32 + offset]);
> +        in24[i]  = _mm_loadu_si128((const __m128i*)&src[24 * 32 + offset]);
> +        in25[i]  = _mm_loadu_si128((const __m128i*)&src[25 * 32 + offset]);
> +        in26[i]  = _mm_loadu_si128((const __m128i*)&src[26 * 32 + offset]);
> +        in27[i]  = _mm_loadu_si128((const __m128i*)&src[27 * 32 + offset]);
> +        in28[i]  = _mm_loadu_si128((const __m128i*)&src[28 * 32 + offset]);
> +        in29[i]  = _mm_loadu_si128((const __m128i*)&src[29 * 32 + offset]);
> +        in30[i]  = _mm_loadu_si128((const __m128i*)&src[30 * 32 + offset]);
> +        in31[i]  = _mm_loadu_si128((const __m128i*)&src[31 * 32 + offset]);
>      }
>  
>      for (int pass = 0; pass < 2; pass++)
> @@ -1564,7 +1418,7 @@
>       * still somewhat rare on end-user PCs we still compile and link these SSE3
>       * intrinsic SIMD functions */
>  #if !HIGH_BIT_DEPTH
> -    p.idct[IDCT_8x8] = idct8;
> +//    p.idct[IDCT_8x8] = idct8;
>      p.idct[IDCT_16x16] = idct16;
>      p.idct[IDCT_32x32] = idct32;
>  #endif
> diff -r 2f0062f0791b -r 706fa4af912b source/common/vec/dct-sse41.cpp
> --- a/source/common/vec/dct-sse41.cpp	Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/common/vec/dct-sse41.cpp	Tue Nov 18 14:00:27 2014 +0530
> @@ -36,7 +36,7 @@
>  using namespace x265;
>  
>  namespace {
> -void dequant_scaling(const int16_t* quantCoef, const int32_t *deQuantCoef, int32_t* coef, int num, int per, int shift)
> +void dequant_scaling(const int16_t* quantCoef, const int32_t *deQuantCoef, int16_t* coef, int num, int per, int shift)
>  {
>      X265_CHECK(num <= 32 * 32, "dequant num too large\n");
>  
> @@ -66,11 +66,7 @@
>              quantCoef2 = _mm_sra_epi32(_mm_add_epi32(_mm_mullo_epi32(quantCoef2, deQuantCoef2), IAdd), _mm_cvtsi32_si128(shift - per));
>  
>              quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2);
> -            sign = _mm_srai_epi16(quantCoef12, 15);
> -            quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign);
> -            _mm_storeu_si128((__m128i*)(coef + n), quantCoef1);
> -            quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign);
> -            _mm_storeu_si128((__m128i*)(coef + n + 4), quantCoef2);
> +            _mm_storeu_si128((__m128i*)(coef + n), quantCoef12);
>          }
>      }
>      else
> @@ -100,11 +96,7 @@
>              quantCoef2 = _mm_sll_epi32(quantCoef2, _mm_cvtsi32_si128(per - shift));
>  
>              quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2);
> -            sign = _mm_srai_epi16(quantCoef12, 15);
> -            quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign);
> -            _mm_storeu_si128((__m128i*)(coef + n), quantCoef1);
> -            quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign);
> -            _mm_storeu_si128((__m128i*)(coef + n + 4), quantCoef2);
> +            _mm_storeu_si128((__m128i*)(coef + n), quantCoef12);
>          }
>      }
>  }
> diff -r 2f0062f0791b -r 706fa4af912b source/common/vec/dct-ssse3.cpp
> --- a/source/common/vec/dct-ssse3.cpp	Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/common/vec/dct-ssse3.cpp	Tue Nov 18 14:00:27 2014 +0530
> @@ -100,7 +100,7 @@
>  #undef MAKE_COEF
>  };
>  
> -void dct16(int16_t *src, int32_t *dst, intptr_t stride)
> +void dct16(int16_t *src, int16_t *dst, intptr_t stride)
>  {
>      // Const
>      __m128i c_4     = _mm_set1_epi32(4);
> @@ -344,8 +344,10 @@
>          T41  = _mm_hsub_epi32(T30, T31);
>          T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
>          T41  = _mm_srai_epi32(_mm_add_epi32(T41, c_512), 10);
> -        _mm_storeu_si128((__m128i*)&dst[0 * 16 + i], T40);
> -        _mm_storeu_si128((__m128i*)&dst[8 * 16 + i], T41);
> +        T40  = _mm_packs_epi32(T40, T40);
> +        T41  = _mm_packs_epi32(T41, T41);
> +        _mm_storel_epi64((__m128i*)&dst[0 * 16 + i], T40);
> +        _mm_storel_epi64((__m128i*)&dst[8 * 16 + i], T41);
>  
>          T20  = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
>          T21  = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
> @@ -366,7 +368,8 @@
>  
>          T40  = _mm_hadd_epi32(T30, T31);
>          T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
> -        _mm_storeu_si128((__m128i*)&dst[4 * 16 + i], T40);
> +        T40  = _mm_packs_epi32(T40, T40);
> +        _mm_storel_epi64((__m128i*)&dst[4 * 16 + i], T40);
>  
>          T20  = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
>          T21  = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
> @@ -387,7 +390,8 @@
>  
>          T40  = _mm_hadd_epi32(T30, T31);
>          T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
> -        _mm_storeu_si128((__m128i*)&dst[12 * 16 + i], T40);
> +        T40  = _mm_packs_epi32(T40, T40);
> +        _mm_storel_epi64((__m128i*)&dst[12 * 16 + i], T40);
>  
>          T20  = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
>          T21  = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
> @@ -408,7 +412,8 @@
>  
>          T40  = _mm_hadd_epi32(T30, T31);
>          T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
> -        _mm_storeu_si128((__m128i*)&dst[2 * 16 + i], T40);
> +        T40  = _mm_packs_epi32(T40, T40);
> +        _mm_storel_epi64((__m128i*)&dst[2 * 16 + i], T40);
>  
>          T20  = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
>          T21  = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
> @@ -429,7 +434,8 @@
>  
>          T40  = _mm_hadd_epi32(T30, T31);
>          T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
> -        _mm_storeu_si128((__m128i*)&dst[6 * 16 + i], T40);
> +        T40  = _mm_packs_epi32(T40, T40);
> +        _mm_storel_epi64((__m128i*)&dst[6 * 16 + i], T40);
>  
>          T20  = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
>          T21  = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
> @@ -450,7 +456,8 @@
>  
>          T40  = _mm_hadd_epi32(T30, T31);
>          T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
> -        _mm_storeu_si128((__m128i*)&dst[10 * 16 + i], T40);
> +        T40  = _mm_packs_epi32(T40, T40);
> +        _mm_storel_epi64((__m128i*)&dst[10 * 16 + i], T40);
>  
>          T20  = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
>          T21  = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
> @@ -471,7 +478,8 @@
>  
>          T40  = _mm_hadd_epi32(T30, T31);
>          T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
> -        _mm_storeu_si128((__m128i*)&dst[14 * 16 + i], T40);
> +        T40  = _mm_packs_epi32(T40, T40);
> +        _mm_storel_epi64((__m128i*)&dst[14 * 16 + i], T40);
>  
>  #define MAKE_ODD(tab, dstPos) \
>      T20  = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)]));       /* [*O2_0 *O1_0 *O3_0 *O0_0] */ \
> @@ -493,7 +501,8 @@
>          \
>      T40  = _mm_hadd_epi32(T30, T31); \
>      T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10); \
> -    _mm_storeu_si128((__m128i*)&dst[(dstPos) * 16 + i], T40);
> +    T40  = _mm_packs_epi32(T40, T40); \
> +    _mm_storel_epi64((__m128i*)&dst[(dstPos) * 16 + i], T40);
>  
>          MAKE_ODD(14,  1);
>          MAKE_ODD(16,  3);
> @@ -657,7 +666,7 @@
>  #undef MAKE_COEF16
>  };
>  
> -void dct32(int16_t *src, int32_t *dst, intptr_t stride)
> +void dct32(int16_t *src, int16_t *dst, intptr_t stride)
>  {
>      // Const
>      __m128i c_8     = _mm_set1_epi32(8);
> @@ -1050,7 +1059,8 @@
>      T60  = _mm_hadd_epi32(T60, T61); \
>          \
>      T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_1024), 11); \
> -    _mm_storeu_si128((__m128i*)&dst[(dstPos) * 32 + (i * 4) + 0], T60); \
> +    T60  = _mm_packs_epi32(T60, T60); \
> +    _mm_storel_epi64((__m128i*)&dst[(dstPos) * 32 + (i * 4) + 0], T60); \
>  
>          MAKE_ODD(44, 44, 44, 44,  0);
>          MAKE_ODD(45, 45, 45, 45, 16);
> diff -r 2f0062f0791b -r 706fa4af912b source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp	Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/common/x86/asm-primitives.cpp	Tue Nov 18 14:00:27 2014 +0530
> @@ -1336,7 +1336,6 @@
>          p.sad_x4[LUMA_16x4] = x265_pixel_sad_x4_16x4_sse2;
>          p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_mmx2;
>  
> -        p.cvt32to16_shr = x265_cvt32to16_shr_sse2;
>          p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_sse2;
>          p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_sse2;
>          p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_sse2;
> @@ -1407,7 +1406,6 @@
>          p.quant = x265_quant_sse4;
>          p.nquant = x265_nquant_sse4;
>          p.dequant_normal = x265_dequant_normal_sse4;
> -        p.cvt16to32_shl = x265_cvt16to32_shl_sse4;
>          p.cvt16to32_shr[BLOCK_4x4] = x265_cvt16to32_shr_4_sse4;
>          p.cvt16to32_shr[BLOCK_8x8] = x265_cvt16to32_shr_8_sse4;
>          p.cvt16to32_shr[BLOCK_16x16] = x265_cvt16to32_shr_16_sse4;
> @@ -1448,7 +1446,6 @@
>          p.idct[IDCT_8x8] = x265_idct8_avx2;
>          p.idct[IDCT_16x16] = x265_idct16_avx2;
>          p.idct[IDCT_32x32] = x265_idct32_avx2;
> -
>          p.transpose[BLOCK_8x8] = x265_transpose8_avx2;
>          p.transpose[BLOCK_16x16] = x265_transpose16_avx2;
>          p.transpose[BLOCK_32x32] = x265_transpose32_avx2;
> @@ -1551,7 +1548,6 @@
>          p.frame_init_lowres_core = x265_frame_init_lowres_core_sse2;
>          SA8D_INTER_FROM_BLOCK(sse2);
>  
> -        p.cvt32to16_shr = x265_cvt32to16_shr_sse2;
>          p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_sse2;
>          p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_sse2;
>          p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_sse2;
> @@ -1565,9 +1561,11 @@
>          p.transpose[BLOCK_64x64] = x265_transpose64_sse2;
>          p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2;
>          p.ssim_end_4 = x265_pixel_ssim_end4_sse2;
> +
>          p.dct[DCT_4x4] = x265_dct4_sse2;
>          p.idct[IDCT_4x4] = x265_idct4_sse2;
>          p.idct[IDST_4x4] = x265_idst4_sse2;
> +
>          p.planecopy_sp = x265_downShift_16_sse2;
>          p.copy_shl[BLOCK_4x4] = x265_copy_shl_4_sse2;
>          p.copy_shl[BLOCK_8x8] = x265_copy_shl_8_sse2;
> @@ -1616,7 +1614,7 @@
>          LUMA_ADDAVG(_sse4);
>          CHROMA_ADDAVG(_sse4);
>          CHROMA_ADDAVG_422(_sse4);
> -        p.cvt16to32_shl = x265_cvt16to32_shl_sse4;
> +        p.cpy16to16_shl = x265_copy16to16_shl_sse4;
>          p.cvt16to32_shr[BLOCK_4x4] = x265_cvt16to32_shr_4_sse4;
>          p.cvt16to32_shr[BLOCK_8x8] = x265_cvt16to32_shr_8_sse4;
>          p.cvt16to32_shr[BLOCK_16x16] = x265_cvt16to32_shr_16_sse4;
> @@ -1765,11 +1763,13 @@
>          p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_avx2;
>          p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_avx2;
>          p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_avx2;
> +
>          p.denoiseDct = x265_denoise_dct_avx2;
>          p.dct[DCT_4x4] = x265_dct4_avx2;
>          p.quant = x265_quant_avx2;
>          p.nquant = x265_nquant_avx2;
>          p.dequant_normal = x265_dequant_normal_avx2;
> +
>          p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x4] = x265_blockcopy_ss_16x4_avx;
>          p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x12] = x265_blockcopy_ss_16x12_avx;
>          p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x8] = x265_blockcopy_ss_16x8_avx;
> @@ -1785,6 +1785,7 @@
>          p.weight_pp = x265_weight_pp_avx2;
>  
>  #if X86_64
> +
>          p.dct[DCT_8x8] = x265_dct8_avx2;
>          p.dct[DCT_16x16] = x265_dct16_avx2;
>          p.dct[DCT_32x32] = x265_dct32_avx2;
> diff -r 2f0062f0791b -r 706fa4af912b source/common/x86/blockcopy8.asm
> --- a/source/common/x86/blockcopy8.asm	Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/common/x86/blockcopy8.asm	Tue Nov 18 14:00:27 2014 +0530
> @@ -3669,85 +3669,11 @@
>  BLOCKCOPY_SS_W64_H4_avx 64, 48
>  BLOCKCOPY_SS_W64_H4_avx 64, 64
>  
> -;-----------------------------------------------------------------------------
> -; void cvt32to16_shr(short *dst, int *src, intptr_t stride, int shift, int size)
> -;-----------------------------------------------------------------------------
> -INIT_XMM sse2
> -cglobal cvt32to16_shr, 4, 7, 3, dst, src, stride
> -%define rnd     m2
> -%define shift   m1
> -
> -    ; make shift
> -    mov         r5d, r3m
> -    movd        shift, r5d
> -
> -    ; make round
> -    dec         r5
> -    xor         r6, r6
> -    bts         r6, r5
> -    
> -    movd        rnd, r6d
> -    pshufd      rnd, rnd, 0
> -
> -    ; register alloc
> -    ; r0 - dst
> -    ; r1 - src
> -    ; r2 - stride * 2 (short*)
> -    ; r3 - lx
> -    ; r4 - size
> -    ; r5 - ly
> -    ; r6 - diff
> -    add         r2d, r2d
> -
> -    mov         r4d, r4m
> -    mov         r5, r4
> -    mov         r6, r2
> -    sub         r6, r4
> -    add         r6, r6
> -
> -    shr         r5, 1
> -.loop_row:
> -
> -    mov         r3, r4
> -    shr         r3, 2
> -.loop_col:
> -    ; row 0
> -    movu        m0, [r1]
> -    paddd       m0, rnd
> -    psrad       m0, shift
> -    packssdw    m0, m0
> -    movh        [r0], m0
> -
> -    ; row 1
> -    movu        m0, [r1 + r4 * 4]
> -    paddd       m0, rnd
> -    psrad       m0, shift
> -    packssdw    m0, m0
> -    movh        [r0 + r2], m0
> -
> -    ; move col pointer
> -    add         r1, 16
> -    add         r0, 8
> -
> -    dec         r3
> -    jg          .loop_col
> -
> -    ; update pointer
> -    lea         r1, [r1 + r4 * 4]
> -    add         r0, r6
> -
> -    ; end of loop_row
> -    dec         r5
> -    jg         .loop_row
> -    
> -    RET
> -
> -
>  ;--------------------------------------------------------------------------------------
> -; void cvt16to32_shl(int32_t *dst, int16_t *src, intptr_t stride, int shift, int size);
> +; void copy16to16_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size);
>  ;--------------------------------------------------------------------------------------
>  INIT_XMM sse4
> -cglobal cvt16to32_shl, 5, 7, 2, dst, src, stride, shift, size
> +cglobal copy16to16_shl, 5, 7, 2, dst, src, stride, shift, size
>  %define shift       m1
>  
>      ; make shift
> @@ -3764,16 +3690,16 @@
>      sub             r2d,      r4d
>      add             r2d,      r2d
>      mov             r5d,      r4d
> -    shr             r4d,      2
> +    shr             r4d,      3
>  .loop_row:
>      mov             r6d,      r4d
>  
>  .loop_col:
> -    pmovsxwd        m0,       [r1]
> -    pslld           m0,       shift
> +    movu            m0,       [r1]
> +    psllw           m0,       shift
>      movu            [r0],     m0
>  
> -    add             r1,       8
> +    add             r1,       16
>      add             r0,       16
>  
>      dec             r6d
> diff -r 2f0062f0791b -r 706fa4af912b source/common/x86/blockcopy8.h
> --- a/source/common/x86/blockcopy8.h	Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/common/x86/blockcopy8.h	Tue Nov 18 14:00:27 2014 +0530
> @@ -24,7 +24,6 @@
>  #ifndef X265_BLOCKCOPY8_H
>  #define X265_BLOCKCOPY8_H
>  
> -void x265_cvt32to16_shr_sse2(int16_t * dst, int *src, intptr_t, int, int);
>  void x265_cvt32to16_shl_4_sse2(int16_t * dst, int *src, intptr_t, int);
>  void x265_cvt32to16_shl_8_sse2(int16_t * dst, int *src, intptr_t, int);
>  void x265_cvt32to16_shl_16_sse2(int16_t * dst, int *src, intptr_t, int);
> @@ -33,7 +32,7 @@
>  void x265_cvt32to16_shl_8_avx2(int16_t * dst, int *src, intptr_t, int);
>  void x265_cvt32to16_shl_16_avx2(int16_t * dst, int *src, intptr_t, int);
>  void x265_cvt32to16_shl_32_avx2(int16_t * dst, int *src, intptr_t, int);
> -void x265_cvt16to32_shl_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
> +void x265_copy16to16_shl_sse4(int16_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
>  void x265_cvt16to32_shr_4_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
>  void x265_cvt16to32_shr_8_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
>  void x265_cvt16to32_shr_16_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
> diff -r 2f0062f0791b -r 706fa4af912b source/common/x86/dct8.asm
> --- a/source/common/x86/dct8.asm	Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/common/x86/dct8.asm	Tue Nov 18 14:00:27 2014 +0530
> @@ -245,7 +245,7 @@
>  
>  avx2_idct4_2:   dw 64, 64, 64, -64, 83, 36, 36, -83
>  
> -const idct4_shuf1,    times 2 db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15
> +const idct4_shuf1,    times 2 db 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15
>  
>  idct4_shuf2:    times 2 db 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8 ,9 ,10, 11
>  
> @@ -318,7 +318,7 @@
>  cextern pw_ppppmmmm
>  
>  ;------------------------------------------------------
> -;void dct4(int16_t *src, int32_t *dst, intptr_t stride)
> +;void dct4(int16_t *src, int16_t *dst, intptr_t stride)
>  ;------------------------------------------------------
>  INIT_XMM sse2
>  cglobal dct4, 3, 4, 8
> @@ -384,28 +384,28 @@
>      paddd       m1, m3
>      paddd       m1, m7
>      psrad       m1, 8
> +
> +    pmaddwd     m4, m2, m5
> +    pmaddwd     m3, m0, m5
> +    psubd       m4, m3
> +    paddd       m4, m7
> +    psrad       m4, 8
> +    packssdw    m1, m4
>      movu        [r1 + 0 * 16], m1
>  
> -    pmaddwd     m1, m2, m5
> -    pmaddwd     m3, m0, m5
> -    psubd       m1, m3
> -    paddd       m1, m7
> -    psrad       m1, 8
> -    movu        [r1 + 1 * 16], m1
> -
>      pmaddwd     m1, m2, m6
>      pmaddwd     m3, m0, m6
>      paddd       m1, m3
>      paddd       m1, m7
>      psrad       m1, 8
> -    movu        [r1 + 2 * 16], m1
>  
>      pmaddwd     m2, [r3 + 3 * 16]
>      pmaddwd     m0, [r3 + 3 * 16]
>      psubd       m2, m0
>      paddd       m2, m7
>      psrad       m2, 8
> -    movu        [r1 + 3 * 16], m2
> +    packssdw    m1, m2
> +    movu        [r1 + 1 * 16], m1
>      RET
>  
>  ; DCT 4x4
> @@ -470,14 +470,12 @@
>      paddd           m2, m7
>      psrad           m2, 8
>  
> -    movu            [r1], xm3
> -    movu            [r1 + mmsize/2], m2
> -    vextracti128    [r1 + mmsize], m3, 1
> -    vextracti128    [r1 + mmsize + mmsize/2], m2, 1
> +    packssdw        m3, m2
> +    movu            [r1], m3
>      RET
>  
>  ;-------------------------------------------------------
> -;void idct4(int32_t *src, int16_t *dst, intptr_t stride)
> +;void idct4(int16_t *src, int16_t *dst, intptr_t stride)
>  ;-------------------------------------------------------
>  INIT_XMM sse2
>  cglobal idct4, 3, 4, 7
> @@ -497,11 +495,6 @@
>  
>      movu        m0, [r0 + 0 * 16]
>      movu        m1, [r0 + 1 * 16]
> -    packssdw    m0, m1
> -
> -    movu        m1, [r0 + 2 * 16]
> -    movu        m2, [r0 + 3 * 16]
> -    packssdw    m1, m2
>  
>      punpcklwd   m2, m0, m1
>      pmaddwd     m3, m2, [r3 + 0 * 16]       ; m3 = E1
> @@ -572,7 +565,7 @@
>      RET
>  
>  ;------------------------------------------------------
> -;void dst4(int16_t *src, int32_t *dst, intptr_t stride)
> +;void dst4(int16_t *src, int16_t *dst, intptr_t stride)
>  ;------------------------------------------------------
>  INIT_XMM ssse3
>  %if ARCH_X86_64
> @@ -638,33 +631,33 @@
>      phaddd      m0, m1
>      paddd       m0, m5
>      psrad       m0, 8
> +
> +    pmaddwd     m4, m2, coef1
> +    pmaddwd     m1, m3, coef1
> +    phaddd      m4, m1
> +    paddd       m4, m5
> +    psrad       m4, 8
> +    packssdw    m0, m4
>      movu        [r1 + 0 * 16], m0
>  
> -    pmaddwd     m0, m2, coef1
> -    pmaddwd     m1, m3, coef1
> -    phaddd      m0, m1
> -    paddd       m0, m5
> -    psrad       m0, 8
> -    movu        [r1 + 1 * 16], m0
> -
>      pmaddwd     m0, m2, coef2
>      pmaddwd     m1, m3, coef2
>      phaddd      m0, m1
>      paddd       m0, m5
>      psrad       m0, 8
> -    movu        [r1 + 2 * 16], m0
>  
>      pmaddwd     m2, coef3
>      pmaddwd     m3, coef3
>      phaddd      m2, m3
>      paddd       m2, m5
>      psrad       m2, 8
> -    movu        [r1 + 3 * 16], m2
> +    packssdw    m0, m2
> +    movu        [r1 + 1 * 16], m0
>  
>      RET
>  
>  ;-------------------------------------------------------
> -;void idst4(int32_t *src, int16_t *dst, intptr_t stride)
> +;void idst4(int16_t *src, int16_t *dst, intptr_t stride)
>  ;-------------------------------------------------------
>  INIT_XMM sse2
>  cglobal idst4, 3, 4, 7
> @@ -683,11 +676,6 @@
>  
>      movu        m0, [r0 + 0 * 16]
>      movu        m1, [r0 + 1 * 16]
> -    packssdw    m0, m1
> -
> -    movu        m1, [r0 + 2 * 16]
> -    movu        m2, [r0 + 3 * 16]
> -    packssdw    m1, m2
>  
>      punpcklwd   m2, m0, m1                  ; m2 = m128iAC
>      punpckhwd   m0, m1                      ; m0 = m128iBD
> @@ -762,7 +750,7 @@
>  
>  
>  ;-------------------------------------------------------
> -; void dct8(int16_t *src, int32_t *dst, intptr_t stride)
> +; void dct8(int16_t *src, int16_t *dst, intptr_t stride)
>  ;-------------------------------------------------------
>  INIT_XMM sse4
>  cglobal dct8, 3,6,7,0-16*mmsize
> @@ -935,10 +923,16 @@
>      phsubd      m4, m2                  ; m4 = [Row6 Row4]
>      paddd       m4, m6
>      psrad       m4, 9
> -    movh        [r1 + 0*2*mmsize], m3
> -    movhps      [r1 + 2*2*mmsize], m3
> -    movh        [r1 + 4*2*mmsize], m4
> -    movhps      [r1 + 6*2*mmsize], m4
> +
> +    packssdw    m3, m3
> +    movd        [r1 + 0*mmsize], m3
> +    pshufd      m3, m3, 1
> +    movd        [r1 + 2*mmsize], m3
> +
> +    packssdw    m4, m4
> +    movd        [r1 + 4*mmsize], m4
> +    pshufd      m4, m4, 1
> +    movd        [r1 + 6*mmsize], m4
>  
>      ; odd
>      pmulld      m2, m0, [r4 + 2*16]
> @@ -950,8 +944,11 @@
>      phaddd      m2, m4                  ; m2 = [Row3 Row1]
>      paddd       m2, m6
>      psrad       m2, 9
> -    movh        [r1 + 1*2*mmsize], m2
> -    movhps      [r1 + 3*2*mmsize], m2
> +
> +    packssdw    m2, m2
> +    movd        [r1 + 1*mmsize], m2
> +    pshufd      m2, m2, 1
> +    movd        [r1 + 3*mmsize], m2
>  
>      pmulld      m2, m0, [r4 + 4*16]
>      pmulld      m3, m1, [r4 + 4*16]
> @@ -962,10 +959,13 @@
>      phaddd      m2, m4                  ; m2 = [Row7 Row5]
>      paddd       m2, m6
>      psrad       m2, 9
> -    movh        [r1 + 5*2*mmsize], m2
> -    movhps      [r1 + 7*2*mmsize], m2
> -
> -    add         r1, mmsize/2
> +
> +    packssdw    m2, m2
> +    movd        [r1 + 5*mmsize], m2
> +    pshufd      m2, m2, 1
> +    movd        [r1 + 7*mmsize], m2
> +
> +    add         r1, mmsize/4
>      add         r0, 2*2*mmsize
>  %endrep
>  
> @@ -974,17 +974,16 @@
>      RET
>  
>  ;-------------------------------------------------------
> -; void idct8(int32_t *src, int16_t *dst, intptr_t stride)
> +; void idct8(int16_t *src, int16_t *dst, intptr_t stride)
>  ;-------------------------------------------------------
>  INIT_XMM ssse3
>  
>  cglobal patial_butterfly_inverse_internal_pass1
> -    movu        m0, [r0]
> -    movu        m1, [r0 + 4 * 32]
> -    movu        m2, [r0 + 2 * 32]
> -    movu        m3, [r0 + 6 * 32]
> -    packssdw    m0, m2
> -    packssdw    m1, m3
> +    movh        m0, [r0]
> +    movhps      m0, [r0 + 2 * 16]
> +    movh        m1, [r0 + 4 * 16]
> +    movhps      m1, [r0 + 6 * 16]
> +
>      punpckhwd   m2, m0, m1                  ; [2 6]
>      punpcklwd   m0, m1                      ; [0 4]
>      pmaddwd     m1, m0, [r6]                ; EE[0]
> @@ -1004,12 +1003,10 @@
>      paddd       m3, m5
>      paddd       m4, m5
>  
> -    movu        m2, [r0 + 32]
> -    movu        m5, [r0 + 5 * 32]
> -    packssdw    m2, m5
> -    movu        m5, [r0 + 3 * 32]
> -    movu        m6, [r0 + 7 * 32]
> -    packssdw    m5, m6
> +    movh        m2, [r0 + 16]
> +    movhps      m2, [r0 + 5 * 16]
> +    movh        m5, [r0 + 3 * 16]
> +    movhps      m5, [r0 + 7 * 16]
>      punpcklwd   m6, m2, m5                  ;[1 3]
>      punpckhwd   m2, m5                      ;[5 7]
>  
> @@ -1136,7 +1133,7 @@
>  
>      call        patial_butterfly_inverse_internal_pass1
>  
> -    add         r0, 16
> +    add         r0, 8
>      add         r5, 8
>  
>      call        patial_butterfly_inverse_internal_pass1
> @@ -1167,53 +1164,68 @@
>  
>  
>  ;-----------------------------------------------------------------------------
> -; void denoise_dct(int32_t *dct, uint32_t *sum, uint16_t *offset, int size)
> +; void denoise_dct(int16_t *dct, uint32_t *sum, uint16_t *offset, int size)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse4
>  cglobal denoise_dct, 4, 4, 6
>      pxor     m5,  m5
> -    shr      r3d, 2
> +    shr      r3d, 3
>  .loop:
>      mova     m0, [r0]
> -    pabsd    m1, m0
> +    pabsw    m1, m0
> +
>      mova     m2, [r1]
> -    paddd    m2, m1
> +    pmovsxwd m3, m1
> +    paddd    m2, m3
>      mova     [r1], m2
> -    pmovzxwd m3, [r2]
> -    psubd    m1, m3
> -    pcmpgtd  m4, m1, m5
> +    mova     m2, [r1 + 16]
> +    psrldq   m3, m1, 8
> +    pmovsxwd m4, m3
> +    paddd    m2, m4
> +    mova     [r1 + 16], m2
> +
> +    movu     m3, [r2]
> +    psubsw   m1, m3
> +    pcmpgtw  m4, m1, m5
>      pand     m1, m4
> -    psignd   m1, m0
> +    psignw   m1, m0
>      mova     [r0], m1
>      add      r0, 16
> -    add      r1, 16
> -    add      r2, 8
> -    dec      r3d
> -    jnz .loop
> -    RET
> -
> -INIT_YMM avx2
> -cglobal denoise_dct, 4, 4, 6
> -    pxor     m5,  m5
> -    shr      r3d, 3
> -.loop:
> -    movu     m0, [r0]
> -    pabsd    m1, m0
> -    movu     m2, [r1]
> -    paddd    m2, m1
> -    movu     [r1], m2
> -    pmovzxwd m3, [r2]
> -    psubd    m1, m3
> -    pcmpgtd  m4, m1, m5
> -    pand     m1, m4
> -    psignd   m1, m0
> -    movu     [r0], m1
> -    add      r0, 32
>      add      r1, 32
>      add      r2, 16
>      dec      r3d
>      jnz .loop
>      RET
> +
> +INIT_YMM avx2
> +cglobal denoise_dct, 4, 4, 6
> +    pxor     m5,  m5
> +    shr      r3d, 4
> +.loop:
> +    movu     m0, [r0]
> +    pabsw    m1, m0
> +    movu     m2, [r1]
> +    pmovsxwd m4, xm1
> +    paddd    m2, m4
> +    movu     [r1], m2
> +    vextracti128 xm4, m1, 1
> +    movu     m2, [r1 + 32]
> +    pmovsxwd m3, xm4
> +    paddd    m2, m3
> +    movu     [r1 + 32], m2
> +    movu     m3, [r2]
> +    psubw    m1, m3
> +    pcmpgtw  m4, m1, m5
> +    pand     m1, m4
> +    psignw   m1, m0
> +    movu     [r0], m1
> +    add      r0, 32
> +    add      r1, 64
> +    add      r2, 32
> +    dec      r3d
> +    jnz .loop
> +    RET
> +
>  %if ARCH_X86_64 == 1
>  %macro DCT8_PASS_1 4
>      vpbroadcastq    m0,                 [r6 + %1]
> @@ -1227,7 +1239,7 @@
>      mova            [r5 + %2],          xm2
>  %endmacro
>  
> -%macro DCT8_PASS_2 1
> +%macro DCT8_PASS_2 2
>      vbroadcasti128  m4,                 [r6 + %1]
>      pmaddwd         m6,                 m0, m4
>      pmaddwd         m7,                 m1, m4
> @@ -1238,10 +1250,25 @@
>      phaddd          m6,                 m8
>      paddd           m6,                 m5
>      psrad           m6,                 DCT_SHIFT2
> +
> +    vbroadcasti128  m4,                 [r6 + %2]
> +    pmaddwd         m10,                m0, m4
> +    pmaddwd         m7,                 m1, m4
> +    pmaddwd         m8,                 m2, m4
> +    pmaddwd         m9,                 m3, m4
> +    phaddd          m10,                m7
> +    phaddd          m8,                 m9
> +    phaddd          m10,                m8
> +    paddd           m10,                m5
> +    psrad           m10,                DCT_SHIFT2
> +
> +    packssdw        m6,                 m10
> +    vpermq          m10,                m6, 0xD8
> +
>  %endmacro
>  
>  INIT_YMM avx2
> -cglobal dct8, 3, 7, 10, 0-8*16
> +cglobal dct8, 3, 7, 11, 0-8*16
>  %if BIT_DEPTH == 10
>      %define         DCT_SHIFT          4
>      vbroadcasti128  m5,                [pd_8]
> @@ -1294,9 +1321,6 @@
>      DCT8_PASS_1     7 * 16,             7 * 16, 4, 1
>  
>      ;pass2
> -    mov             r2d,               32
> -    lea             r3,                [r2 * 3]
> -    lea             r4,                [r1 + r2 * 4]
>      vbroadcasti128  m5,                [pd_256]
>  
>      mova            m0,                [r5]
> @@ -1304,22 +1328,14 @@
>      mova            m2,                [r5 + 64]
>      mova            m3,                [r5 + 96]
>  
> -    DCT8_PASS_2     0 * 16
> -    movu            [r1],              m6
> -    DCT8_PASS_2     1 * 16
> -    movu            [r1 + r2],         m6
> -    DCT8_PASS_2     2 * 16
> -    movu            [r1 + r2 * 2],     m6
> -    DCT8_PASS_2     3 * 16
> -    movu            [r1 + r3],         m6
> -    DCT8_PASS_2     4 * 16
> -    movu            [r4],              m6
> -    DCT8_PASS_2     5 * 16
> -    movu            [r4 + r2],         m6
> -    DCT8_PASS_2     6 * 16
> -    movu            [r4 + r2 * 2],     m6
> -    DCT8_PASS_2     7 * 16
> -    movu            [r4 + r3],         m6
> +    DCT8_PASS_2     0 * 16, 1 * 16
> +    movu            [r1],              m10
> +    DCT8_PASS_2     2 * 16, 3 * 16
> +    movu            [r1 + 32],         m10
> +    DCT8_PASS_2     4 * 16, 5 * 16
> +    movu            [r1 + 64],         m10
> +    DCT8_PASS_2     6 * 16, 7 * 16
> +    movu            [r1 + 96],         m10
>      RET
>  
>  %macro DCT16_PASS_1_E 2
> @@ -1360,7 +1376,7 @@
>      mova            [r5 + %2],         xm10
>  %endmacro
>  
> -%macro DCT16_PASS_2 1
> +%macro DCT16_PASS_2 2
>      vbroadcasti128  m8,                [r7 + %1]
>      vbroadcasti128  m13,               [r8 + %1]
>  
> @@ -1385,9 +1401,40 @@
>      phaddd          m10,               m11
>      paddd           m10,               m9
>      psrad           m10,               DCT_SHIFT2
> +
> +
> +    vbroadcasti128  m8,                [r7 + %2]
> +    vbroadcasti128  m13,               [r8 + %2]
> +
> +    pmaddwd         m14,               m0, m8
> +    pmaddwd         m11,               m1, m13
> +    paddd           m14,               m11
> +
> +    pmaddwd         m11,               m2, m8
> +    pmaddwd         m12,               m3, m13
> +    paddd           m11,               m12
> +    phaddd          m14,               m11
> +
> +    pmaddwd         m11,               m4, m8
> +    pmaddwd         m12,               m5, m13
> +    paddd           m11,               m12
> +
> +    pmaddwd         m12,               m6, m8
> +    pmaddwd         m13,               m7, m13
> +    paddd           m12,               m13
> +    phaddd          m11,               m12
> +
> +    phaddd          m14,               m11
> +    paddd           m14,               m9
> +    psrad           m14,               DCT_SHIFT2
> +
> +    packssdw        m10,               m14
> +    vextracti128    xm14,              m10,       1
> +    movlhps         xm15,              xm10,      xm14
> +    movhlps         xm14,              xm10
>  %endmacro
>  INIT_YMM avx2
> -cglobal dct16, 3, 9, 15, 0-16*mmsize
> +cglobal dct16, 3, 9, 16, 0-16*mmsize
>  %if BIT_DEPTH == 10
>      %define         DCT_SHIFT          5
>      vbroadcasti128  m9,                [pd_16]
> @@ -1487,7 +1534,7 @@
>  
>      mov             r5,                rsp
>      mov             r4d,               2
> -    mov             r2d,               64
> +    mov             r2d,               32
>      lea             r3,                [r2 * 3]
>      vbroadcasti128  m9,                [pd_512]
>  
> @@ -1504,46 +1551,42 @@
>      mova            m6,                [r5 + 3 * 32]        ; [row3lo  row7lo]
>      mova            m7,                [r5 + 11 * 32]       ; [row3hi  row7hi]
>  
> -    DCT16_PASS_2    -8 * 16
> -    movu            [r1],              m10
> -    DCT16_PASS_2    -7 * 16
> -    movu            [r1 + r2],         m10
> -    DCT16_PASS_2    -6 * 16
> -    movu            [r1 + r2 * 2],     m10
> -    DCT16_PASS_2    -5 * 16
> -    movu            [r1 + r3],         m10
> +    DCT16_PASS_2    -8 * 16, -7 * 16
> +    movu            [r1],              xm15
> +    movu            [r1 + r2],         xm14
> +
> +    DCT16_PASS_2    -6 * 16, -5 * 16
> +    movu            [r1 + r2 * 2],     xm15
> +    movu            [r1 + r3],         xm14
>  
>      lea             r6,                [r1 + r2 * 4]
> -    DCT16_PASS_2    -4 * 16
> -    movu            [r6],              m10
> -    DCT16_PASS_2    -3 * 16
> -    movu            [r6 + r2],         m10
> -    DCT16_PASS_2    -2 * 16
> -    movu            [r6 + r2 * 2],     m10
> -    DCT16_PASS_2    -1 * 16
> -    movu            [r6 + r3],         m10
> +    DCT16_PASS_2    -4 * 16, -3 * 16
> +    movu            [r6],              xm15
> +    movu            [r6 + r2],         xm14
> +
> +    DCT16_PASS_2    -2 * 16, -1 * 16
> +    movu            [r6 + r2 * 2],     xm15
> +    movu            [r6 + r3],         xm14
>  
>      lea             r6,                [r6 + r2 * 4]
> -    DCT16_PASS_2    0 * 16
> -    movu            [r6],              m10
> -    DCT16_PASS_2    1 * 16
> -    movu            [r6 + r2],         m10
> -    DCT16_PASS_2    2 * 16
> -    movu            [r6 + r2 * 2],     m10
> -    DCT16_PASS_2    3 * 16
> -    movu            [r6 + r3],         m10
> +    DCT16_PASS_2    0 * 16, 1 * 16
> +    movu            [r6],              xm15
> +    movu            [r6 + r2],         xm14
> +
> +    DCT16_PASS_2    2 * 16, 3 * 16
> +    movu            [r6 + r2 * 2],     xm15
> +    movu            [r6 + r3],         xm14
>  
>      lea             r6,                [r6 + r2 * 4]
> -    DCT16_PASS_2    4 * 16
> -    movu            [r6],              m10
> -    DCT16_PASS_2    5 * 16
> -    movu            [r6 + r2],         m10
> -    DCT16_PASS_2    6 * 16
> -    movu            [r6 + r2 * 2],     m10
> -    DCT16_PASS_2    7 * 16
> -    movu            [r6 + r3],         m10
> -
> -    add             r1,                32
> +    DCT16_PASS_2    4 * 16, 5 * 16
> +    movu            [r6],              xm15
> +    movu            [r6 + r2],         xm14
> +
> +    DCT16_PASS_2    6 * 16, 7 * 16
> +    movu            [r6 + r2 * 2],     xm15
> +    movu            [r6 + r3],         xm14
> +
> +    add             r1,                16
>      add             r5,                128
>  
>      dec             r4d
> @@ -1609,6 +1652,7 @@
>  
>      paddd           xm11,               xm9
>      psrad           xm11,               DCT_SHIFT2
> +    packssdw        xm11,               xm11
>  
>  %endmacro
>  
> @@ -1704,7 +1748,7 @@
>      dec             r4d
>      jnz             .pass1
>  
> -    mov             r2d,               128
> +    mov             r2d,               64
>      lea             r3,                [r2 * 3]
>      mov             r5,                rsp
>      mov             r4d,               8
> @@ -1724,86 +1768,86 @@
>      mova            m7,                [r5 + 3 * 64 + 32]
>  
>      DCT32_PASS_2    0 * 32
> -    movu            [r1],              xm11
> +    movq            [r1],              xm11
>      DCT32_PASS_2    1 * 32
> -    movu            [r1 + r2],         xm11
> +    movq            [r1 + r2],         xm11
>      DCT32_PASS_2    2 * 32
> -    movu            [r1 + r2 * 2],     xm11
> +    movq            [r1 + r2 * 2],     xm11
>      DCT32_PASS_2    3 * 32
> -    movu            [r1 + r3],         xm11
> +    movq            [r1 + r3],         xm11
>  
>      lea             r6,                [r1 + r2 * 4]
>      DCT32_PASS_2    4 * 32
> -    movu            [r6],              xm11
> +    movq            [r6],              xm11
>      DCT32_PASS_2    5 * 32
> -    movu            [r6 + r2],         xm11
> +    movq            [r6 + r2],         xm11
>      DCT32_PASS_2    6 * 32
> -    movu            [r6 + r2 * 2],     xm11
> +    movq            [r6 + r2 * 2],     xm11
>      DCT32_PASS_2    7 * 32
> -    movu            [r6 + r3],         xm11
> +    movq            [r6 + r3],         xm11
>  
>      lea             r6,                [r6 + r2 * 4]
>      DCT32_PASS_2    8 * 32
> -    movu            [r6],              xm11
> +    movq            [r6],              xm11
>      DCT32_PASS_2    9 * 32
> -    movu            [r6 + r2],         xm11
> +    movq            [r6 + r2],         xm11
>      DCT32_PASS_2    10 * 32
> -    movu            [r6 + r2 * 2],     xm11
> +    movq            [r6 + r2 * 2],     xm11
>      DCT32_PASS_2    11 * 32
> -    movu            [r6 + r3],         xm11
> +    movq            [r6 + r3],         xm11
>  
>      lea             r6,                [r6 + r2 * 4]
>      DCT32_PASS_2    12 * 32
> -    movu            [r6],              xm11
> +    movq            [r6],              xm11
>      DCT32_PASS_2    13 * 32
> -    movu            [r6 + r2],         xm11
> +    movq            [r6 + r2],         xm11
>      DCT32_PASS_2    14 * 32
> -    movu            [r6 + r2 * 2],     xm11
> +    movq            [r6 + r2 * 2],     xm11
>      DCT32_PASS_2    15 * 32
> -    movu            [r6 + r3],         xm11
> +    movq            [r6 + r3],         xm11
>  
>      lea             r6,                [r6 + r2 * 4]
>      DCT32_PASS_2    16 * 32
> -    movu            [r6],              xm11
> +    movq            [r6],              xm11
>      DCT32_PASS_2    17 * 32
> -    movu            [r6 + r2],         xm11
> +    movq            [r6 + r2],         xm11
>      DCT32_PASS_2    18 * 32
> -    movu            [r6 + r2 * 2],     xm11
> +    movq            [r6 + r2 * 2],     xm11
>      DCT32_PASS_2    19 * 32
> -    movu            [r6 + r3],         xm11
> +    movq            [r6 + r3],         xm11
>  
>      lea             r6,                [r6 + r2 * 4]
>      DCT32_PASS_2    20 * 32
> -    movu            [r6],              xm11
> +    movq            [r6],              xm11
>      DCT32_PASS_2    21 * 32
> -    movu            [r6 + r2],         xm11
> +    movq            [r6 + r2],         xm11
>      DCT32_PASS_2    22 * 32
> -    movu            [r6 + r2 * 2],     xm11
> +    movq            [r6 + r2 * 2],     xm11
>      DCT32_PASS_2    23 * 32
> -    movu            [r6 + r3],         xm11
> +    movq            [r6 + r3],         xm11
>  
>      lea             r6,                [r6 + r2 * 4]
>      DCT32_PASS_2    24 * 32
> -    movu            [r6],              xm11
> +    movq            [r6],              xm11
>      DCT32_PASS_2    25 * 32
> -    movu            [r6 + r2],         xm11
> +    movq            [r6 + r2],         xm11
>      DCT32_PASS_2    26 * 32
> -    movu            [r6 + r2 * 2],     xm11
> +    movq            [r6 + r2 * 2],     xm11
>      DCT32_PASS_2    27 * 32
> -    movu            [r6 + r3],         xm11
> +    movq            [r6 + r3],         xm11
>  
>      lea             r6,                [r6 + r2 * 4]
>      DCT32_PASS_2    28 * 32
> -    movu            [r6],              xm11
> +    movq            [r6],              xm11
>      DCT32_PASS_2    29 * 32
> -    movu            [r6 + r2],         xm11
> +    movq            [r6 + r2],         xm11
>      DCT32_PASS_2    30 * 32
> -    movu            [r6 + r2 * 2],     xm11
> +    movq            [r6 + r2 * 2],     xm11
>      DCT32_PASS_2    31 * 32
> -    movu            [r6 + r3],         xm11
> +    movq            [r6 + r3],         xm11
>  
>      add             r5,                256
> -    add             r1,                16
> +    add             r1,                8
>  
>      dec             r4d
>      jnz             .pass2
> @@ -1926,28 +1970,25 @@
>      lea             r6,                [avx2_idct8_2]
>  
>      ;pass1
> -    mova            m0,                [r0 + 0 * 32]
> -    mova            m1,                [r0 + 4 * 32]
> -    packssdw        m0,                m1               ; [0 0 0 0 4 4 4 4 0 0 0 0 4 4 4 4]
> -    mova            m1,                [r0 + 2 * 32]
> -    mova            m2,                [r0 + 6 * 32]
> -    packssdw        m1,                m2               ; [2 2 2 2 6 6 6 6 2 2 2 2 6 6 6 6]
> -    mova            m2,                [r0 + 1 * 32]
> -    mova            m3,                [r0 + 5 * 32]
> -    packssdw        m2,                m3               ; [1 1 1 1 5 5 5 5 1 1 1 1 5 5 5 5]
> -    mova            m3,                [r0 + 3 * 32]
> -    mova            m4,                [r0 + 7 * 32]
> -    packssdw        m3,                m4               ; [3 3 3 3 7 7 7 7 3 3 3 3 7 7 7 7]
> +    mova            m1,                [r0 + 0 * 32]     ; [0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1]
> +    mova            m0,                [r0 + 1 * 32]     ; [2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3]
> +    vpunpcklwd      m5,      m1,       m0                ; [0 2 0 2 0 2 0 2 1 3 1 3 1 3 1 3]
> +    vpunpckhwd      m1,      m0                          ; [0 2 0 2 0 2 0 2 1 3 1 3 1 3 1 3]
> +    vinserti128     m4,      m5,       xm1,       1      ; [0 2 0 2 0 2 0 2 0 2 0 2 0 2 0 2]
> +    vextracti128    xm2,     m5,       1                 ; [1 3 1 3 1 3 1 3]
> +    vinserti128     m1,      m1,       xm2,       0      ; [1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3]
> +
> +    mova            m2,                [r0 + 2 * 32]     ; [4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5]
> +    mova            m0,                [r0 + 3 * 32]     ; [6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7]
> +    vpunpcklwd      m5,      m2,       m0                ; [4 6 4 6 4 6 4 6 5 7 5 7 5 7 5 7]
> +    vpunpckhwd      m2,      m0                          ; [4 6 4 6 4 6 4 6 5 7 5 7 5 7 5 7]
> +    vinserti128     m0,      m5,       xm2,       1     ; [4 6 4 6 4 6 4 6 4 6 4 6 4 6 4 6]
> +    vextracti128    xm5,     m5,       1                ; [5 7 5 7 5 7 5 7]
> +    vinserti128     m2,      m2,       xm5,       0     ; [5 7 5 7 5 7 5 7 5 7 5 7 5 7 5 7]
>  
>      mova            m5,                [idct8_shuf1]
> -
> -    punpcklwd       m4,                m0, m1           ; [0 2 0 2 0 2 0 2 0 2 0 2 0 2 0 2]
> -    punpckhwd       m0,                m1               ; [4 6 4 6 4 6 4 6 4 6 4 6 4 6 4 6]
>      vpermd          m4,                m5, m4
>      vpermd          m0,                m5, m0
> -
> -    punpcklwd       m1,                m2, m3           ; [1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3]
> -    punpckhwd       m2,                m3               ; [5 7 5 7 5 7 5 7 5 7 5 7 5 7 5 7]
>      vpermd          m1,                m5, m1
>      vpermd          m2,                m5, m2
>  
> @@ -2065,7 +2106,7 @@
>  %endmacro
>  
>  ;-------------------------------------------------------
> -; void idct16(int32_t *src, int16_t *dst, intptr_t stride)
> +; void idct16(int16_t *src, int16_t *dst, intptr_t stride)
>  ;-------------------------------------------------------
>  INIT_YMM avx2
>  cglobal idct16, 3, 7, 16, 0-16*mmsize
> @@ -2087,37 +2128,53 @@
>      mov             r4d, 2
>  
>  .pass1:
> -    movu            m0, [r0 +  0 * 64]
> -    movu            m1, [r0 +  8 * 64]
> -    packssdw        m0, m1                    ;[0L 8L 0H 8H]
> -
> -    movu            m1, [r0 +  1 * 64]
> -    movu            m2, [r0 +  9 * 64]
> -    packssdw        m1, m2                    ;[1L 9L 1H 9H]
> -
> -    movu            m2, [r0 +  2 * 64]
> -    movu            m3, [r0 + 10 * 64]
> -    packssdw        m2, m3                    ;[2L 10L 2H 10H]
> -
> -    movu            m3, [r0 +  3 * 64]
> -    movu            m4, [r0 + 11 * 64]
> -    packssdw        m3, m4                    ;[3L 11L 3H 11H]
> -
> -    movu            m4, [r0 +  4 * 64]
> -    movu            m5, [r0 + 12 * 64]
> -    packssdw        m4, m5                    ;[4L 12L 4H 12H]
> -
> -    movu            m5, [r0 +  5 * 64]
> -    movu            m6, [r0 + 13 * 64]
> -    packssdw        m5, m6                    ;[5L 13L 5H 13H]
> -
> -    movu            m6, [r0 +  6 * 64]
> -    movu            m7, [r0 + 14 * 64]
> -    packssdw        m6, m7                    ;[6L 14L 6H 14H]
> -
> -    movu            m7, [r0 +  7 * 64]
> -    movu            m8, [r0 + 15 * 64]
> -    packssdw        m7, m8                    ;[7L 15L 7H 15H]
> +     movu            xm0, [r0 +  0 * 32]
> +     movu            xm1, [r0 +  8 * 32]
> +     punpckhqdq      xm2, xm0, xm1
> +     punpcklqdq      xm0, xm1
> +     vinserti128     m0, m0, xm2, 1
> +
> +     movu            xm1, [r0 +  1 * 32]
> +     movu            xm2, [r0 +  9 * 32]
> +     punpckhqdq      xm3, xm1, xm2
> +     punpcklqdq      xm1, xm2
> +     vinserti128     m1, m1, xm3, 1
> +
> +     movu            xm2, [r0 + 2  * 32]
> +     movu            xm3, [r0 + 10 * 32]
> +     punpckhqdq      xm4, xm2, xm3
> +     punpcklqdq      xm2, xm3
> +     vinserti128     m2, m2, xm4, 1
> +
> +     movu            xm3, [r0 + 3  * 32]
> +     movu            xm4, [r0 + 11 * 32]
> +     punpckhqdq      xm5, xm3, xm4
> +     punpcklqdq      xm3, xm4
> +     vinserti128     m3, m3, xm5, 1
> +
> +     movu            xm4, [r0 + 4  * 32]
> +     movu            xm5, [r0 + 12 * 32]
> +     punpckhqdq      xm6, xm4, xm5
> +     punpcklqdq      xm4, xm5
> +     vinserti128     m4, m4, xm6, 1
> +
> +     movu            xm5, [r0 + 5  * 32]
> +     movu            xm6, [r0 + 13 * 32]
> +     punpckhqdq      xm7, xm5, xm6
> +     punpcklqdq      xm5, xm6
> +     vinserti128     m5, m5, xm7, 1
> +
> +     movu            xm6, [r0 + 6  * 32]
> +     movu            xm7, [r0 + 14 * 32]
> +     punpckhqdq      xm8, xm6, xm7
> +     punpcklqdq      xm6, xm7
> +     vinserti128     m6, m6, xm8, 1
> +
> +     movu            xm7, [r0 + 7  * 32]
> +     movu            xm8, [r0 + 15 * 32]
> +     punpckhqdq      xm9, xm7, xm8
> +     punpcklqdq      xm7, xm8
> +     vinserti128     m7, m7, xm9, 1
>  
>      punpckhwd       m8, m0, m2                ;[8 10]
>      punpcklwd       m0, m2                    ;[0 2]
> @@ -2160,7 +2217,7 @@
>      IDCT_PASS1      4, 10
>      IDCT_PASS1      6, 8
>  
> -    add             r0, 32
> +    add             r0, 16
>      add             r3, 16
>      dec             r4d
>      jnz             .pass1
> @@ -2328,7 +2385,7 @@
>  %endmacro
>  
>  ;-------------------------------------------------------
> -; void idct32(int32_t *src, int16_t *dst, intptr_t stride)
> +; void idct32(int16_t *src, int16_t *dst, intptr_t stride)
>  ;-------------------------------------------------------
>  
>  ; TODO: Reduce PHADDD instruction by PADDD
> @@ -2345,54 +2402,69 @@
>      mov             r5d, 8
>  
>  .pass1:
> -    movu            xm0,    [r0 +  2 * 128]
> -    movu            xm1,    [r0 + 18 * 128]
> -    vinserti128     m0, m0, [r0 +  0 * 128], 1
> -    vinserti128     m1, m1, [r0 + 16 * 128], 1
> -
> -    packssdw        m0, m1                      ;[2 18 0 16]
> -
> -    movu            xm1,    [r0 +  1 * 128]
> -    movu            xm2,    [r0 +  9 * 128]
> -    vinserti128     m1, m1, [r0 + 17 * 128], 1
> -    vinserti128     m2, m2, [r0 + 25 * 128], 1
> -    packssdw        m1, m2                      ;[1 9 17 25]
> -
> -    movu            xm2,    [r0 +  6 * 128]
> -    movu            xm3,    [r0 + 22 * 128]
> -    vinserti128     m2, m2, [r0 +  4 * 128], 1
> -    vinserti128     m3, m3, [r0 + 20 * 128], 1
> -    packssdw        m2, m3                      ;[6 22 4 20]
> -
> -    movu            xm3,    [r0 +  3 * 128]
> -    movu            xm4,    [r0 + 11 * 128]
> -    vinserti128     m3, m3, [r0 + 19 * 128], 1
> -    vinserti128     m4, m4, [r0 + 27 * 128], 1
> -    packssdw        m3, m4                      ;[3 11 19 27]
> -
> -    movu            xm4,    [r0 + 10 * 128]
> -    movu            xm5,    [r0 + 26 * 128]
> -    vinserti128     m4, m4, [r0 +  8 * 128], 1
> -    vinserti128     m5, m5, [r0 + 24 * 128], 1
> -    packssdw        m4, m5                      ;[10 26 8 24]
> -
> -    movu            xm5,    [r0 +  5 * 128]
> -    movu            xm6,    [r0 + 13 * 128]
> -    vinserti128     m5, m5, [r0 + 21 * 128], 1
> -    vinserti128     m6, m6, [r0 + 29 * 128], 1
> -    packssdw        m5, m6                      ;[5 13 21 29]
> -
> -    movu            xm6,    [r0 + 14 * 128]
> -    movu            xm7,    [r0 + 30 * 128]
> -    vinserti128     m6, m6, [r0 + 12 * 128], 1
> -    vinserti128     m7, m7, [r0 + 28 * 128], 1
> -    packssdw        m6, m7                      ;[14 30 12 28]
> -
> -    movu            xm7,    [r0 +  7 * 128]
> -    movu            xm8,    [r0 + 15 * 128]
> -    vinserti128     m7, m7, [r0 + 23 * 128], 1
> -    vinserti128     m8, m8, [r0 + 31 * 128], 1
> -    packssdw        m7, m8                      ;[7 15 23 31]
> +    movq            xm0,    [r0 +  2 * 64]
> +    movq            xm1,    [r0 + 18 * 64]
> +    punpcklqdq      xm0, xm0, xm1
> +    movq            xm1,    [r0 +  0 * 64]
> +    movq            xm2,    [r0 + 16 * 64]
> +    punpcklqdq      xm1, xm1, xm2
> +    vinserti128     m0,  m0,  xm1, 1             ;[2 18 0 16]
> +
> +    movq            xm1,    [r0 + 1 * 64]
> +    movq            xm2,    [r0 + 9 * 64]
> +    punpcklqdq      xm1, xm1, xm2
> +    movq            xm2,    [r0 + 17 * 64]
> +    movq            xm3,    [r0 + 25 * 64]
> +    punpcklqdq      xm2, xm2, xm3
> +    vinserti128     m1,  m1,  xm2, 1             ;[1 9 17 25]
> +
> +    movq            xm2,    [r0 + 6 * 64]
> +    movq            xm3,    [r0 + 22 * 64]
> +    punpcklqdq      xm2, xm2, xm3
> +    movq            xm3,    [r0 + 4 * 64]
> +    movq            xm4,    [r0 + 20 * 64]
> +    punpcklqdq      xm3, xm3, xm4
> +    vinserti128     m2,  m2,  xm3, 1             ;[6 22 4 20]
> +
> +    movq            xm3,    [r0 + 3 * 64]
> +    movq            xm4,    [r0 + 11 * 64]
> +    punpcklqdq      xm3, xm3, xm4
> +    movq            xm4,    [r0 + 19 * 64]
> +    movq            xm5,    [r0 + 27 * 64]
> +    punpcklqdq      xm4, xm4, xm5
> +    vinserti128     m3,  m3,  xm4, 1             ;[3 11 19 27]
> +
> +    movq            xm4,    [r0 + 10 * 64]
> +    movq            xm5,    [r0 + 26 * 64]
> +    punpcklqdq      xm4, xm4, xm5
> +    movq            xm5,    [r0 + 8 * 64]
> +    movq            xm6,    [r0 + 24 * 64]
> +    punpcklqdq      xm5, xm5, xm6
> +    vinserti128     m4,  m4,  xm5, 1             ;[10 26 8 24]
> +
> +    movq            xm5,    [r0 + 5 * 64]
> +    movq            xm6,    [r0 + 13 * 64]
> +    punpcklqdq      xm5, xm5, xm6
> +    movq            xm6,    [r0 + 21 * 64]
> +    movq            xm7,    [r0 + 29 * 64]
> +    punpcklqdq      xm6, xm6, xm7
> +    vinserti128     m5,  m5,  xm6, 1             ;[5 13 21 29]
> +
> +    movq            xm6,    [r0 + 14 * 64]
> +    movq            xm7,    [r0 + 30 * 64]
> +    punpcklqdq      xm6, xm6, xm7
> +    movq            xm7,    [r0 + 12 * 64]
> +    movq            xm8,    [r0 + 28 * 64]
> +    punpcklqdq      xm7, xm7, xm8
> +    vinserti128     m6,  m6,  xm7, 1             ;[14 30 12 28]
> +
> +    movq            xm7,    [r0 + 7 * 64]
> +    movq            xm8,    [r0 + 15 * 64]
> +    punpcklqdq      xm7, xm7, xm8
> +    movq            xm8,    [r0 + 23 * 64]
> +    movq            xm9,    [r0 + 31 * 64]
> +    punpcklqdq      xm8, xm8, xm9
> +    vinserti128     m7,  m7,  xm8, 1             ;[7 15 23 31]
>  
>      punpckhwd       m8, m0, m2                  ;[18 22 16 20]
>      punpcklwd       m0, m2                      ;[2 6 0 4]
> @@ -2451,7 +2523,7 @@
>      IDCT32_PASS1 6
>      IDCT32_PASS1 7
>  
> -    add             r0, 16
> +    add             r0, 8
>      add             r3, 4
>      add             r4, 4
>      dec             r5d
> @@ -2612,7 +2684,7 @@
>      RET
>  
>  ;-------------------------------------------------------
> -; void idct4(int32_t *src, int16_t *dst, intptr_t stride)
> +; void idct4(int16_t *src, int16_t *dst, intptr_t stride)
>  ;-------------------------------------------------------
>  INIT_YMM avx2
>  cglobal idct4, 3, 4, 6
> @@ -2632,13 +2704,14 @@
>      add             r2d, r2d
>      lea             r3, [r2 * 3]
>  
> -    movu            m0, [r0]                      ;[00 01 02 03 10 11 12 13]
> -    movu            m1, [r0 + 32]                 ;[20 21 22 23 30 31 32 33]
> -
> -    packssdw        m0, m1                        ;[00 01 02 03 20 21 22 23 10 11 12 13 30 31 32 33]
> -    pshufb          m0, [idct4_shuf1]             ;[00 20 02 22 01 21 03 23 10 30 12 32 11 31 13 33]
> -    vpermq          m2, m0, 0x44                  ;[00 20 02 22 01 21 03 23 00 20 02 22 01 21 03 23]
> -    vpermq          m0, m0, 0xEE                  ;[10 30 12 32 11 31 13 33 10 30 12 32 11 31 13 33]
> +    movu            m0, [r0]                      ;[00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33]
> +
> +    pshufb          m0, [idct4_shuf1]             ;[00 02 01 03 10 12 11 13 20 22 21 23 30 32 31 33]
> +    vextracti128    xm1, m0, 1                    ;[20 22 21 23 30 32 31 33]
> +    punpcklwd       xm2, xm0, xm1                 ;[00 20 02 22 01 21 03 23]
> +    punpckhwd       xm0, xm1                      ;[10 30 12 32 11 31 13 33]
> +    vinserti128     m2, m2, xm2, 1                ;[00 20 02 22 01 21 03 23 00 20 02 22 01 21 03 23]
> +    vinserti128     m0, m0, xm0, 1                ;[10 30 12 32 11 31 13 33 10 30 12 32 11 31 13 33]
>  
>      mova            m1, [avx2_idct4_1]
>      mova            m3, [avx2_idct4_1 + 32]
> diff -r 2f0062f0791b -r 706fa4af912b source/common/x86/dct8.h
> --- a/source/common/x86/dct8.h	Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/common/x86/dct8.h	Tue Nov 18 14:00:27 2014 +0530
> @@ -23,23 +23,23 @@
>  
>  #ifndef X265_DCT8_H
>  #define X265_DCT8_H
> -void x265_dct4_sse2(int16_t *src, int32_t *dst, intptr_t stride);
> -void x265_dst4_ssse3(int16_t *src, int32_t *dst, intptr_t stride);
> -void x265_dct8_sse4(int16_t *src, int32_t *dst, intptr_t stride);
> -void x265_dct4_avx2(int16_t *src, int32_t *dst, intptr_t stride);
> -void x265_dct8_avx2(int16_t *src, int32_t *dst, intptr_t stride);
> -void x265_dct16_avx2(int16_t *src, int32_t *dst, intptr_t stride);
> -void x265_dct32_avx2(int16_t *src, int32_t *dst, intptr_t stride);
> -void x265_idct32_avx2(int32_t *src, int16_t *dst, intptr_t stride);
> +void x265_dct4_sse2(int16_t *src, int16_t *dst, intptr_t stride);
> +void x265_dst4_ssse3(int16_t *src, int16_t *dst, intptr_t stride);
> +void x265_dct8_sse4(int16_t *src, int16_t *dst, intptr_t stride);
> +void x265_dct4_avx2(int16_t *src, int16_t *dst, intptr_t stride);
> +void x265_dct8_avx2(int16_t *src, int16_t *dst, intptr_t stride);
> +void x265_dct16_avx2(int16_t *src, int16_t *dst, intptr_t stride);
> +void x265_dct32_avx2(int16_t *src, int16_t *dst, intptr_t stride);
> +void x265_idct32_avx2(int16_t *src, int16_t *dst, intptr_t stride);
>  
> -void x265_idst4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
> -void x265_idct4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
> -void x265_idct4_avx2(int32_t *src, int16_t *dst, intptr_t stride);
> -void x265_idct8_ssse3(int32_t *src, int16_t *dst, intptr_t stride);
> -void x265_idct8_avx2(int32_t *src, int16_t *dst, intptr_t stride);
> -void x265_idct16_avx2(int32_t *src, int16_t *dst, intptr_t stride);
> +void x265_idst4_sse2(int16_t *src, int16_t *dst, intptr_t stride);
> +void x265_idct4_sse2(int16_t *src, int16_t *dst, intptr_t stride);
> +void x265_idct4_avx2(int16_t *src, int16_t *dst, intptr_t stride);
> +void x265_idct8_ssse3(int16_t *src, int16_t *dst, intptr_t stride);
> +void x265_idct8_avx2(int16_t *src, int16_t *dst, intptr_t stride);
> +void x265_idct16_avx2(int16_t *src, int16_t *dst, intptr_t stride);
>  
> -void x265_denoise_dct_sse4(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
> -void x265_denoise_dct_avx2(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
> +void x265_denoise_dct_sse4(int16_t *dct, uint32_t *sum, uint16_t *offset, int size);
> +void x265_denoise_dct_avx2(int16_t *dct, uint32_t *sum, uint16_t *offset, int size);
>  
>  #endif // ifndef X265_DCT8_H
> diff -r 2f0062f0791b -r 706fa4af912b source/common/x86/pixel-util.h
> --- a/source/common/x86/pixel-util.h	Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/common/x86/pixel-util.h	Tue Nov 18 14:00:27 2014 +0530
> @@ -42,12 +42,12 @@
>  void x265_transpose32_avx2(pixel *dest, pixel *src, intptr_t stride);
>  void x265_transpose64_avx2(pixel *dest, pixel *src, intptr_t stride);
>  
> -uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
> -uint32_t x265_quant_avx2(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
> -uint32_t x265_nquant_sse4(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
> -uint32_t x265_nquant_avx2(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
> -void x265_dequant_normal_sse4(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift);
> -void x265_dequant_normal_avx2(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift);
> +uint32_t x265_quant_sse4(int16_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
> +uint32_t x265_quant_avx2(int16_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
> +uint32_t x265_nquant_sse4(int16_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
> +uint32_t x265_nquant_avx2(int16_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
> +void x265_dequant_normal_sse4(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
> +void x265_dequant_normal_avx2(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
>  int x265_count_nonzero_ssse3(const int16_t *quantCoeff, int numCoeff);
>  
>  void x265_weight_pp_sse4(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
> diff -r 2f0062f0791b -r 706fa4af912b source/common/x86/pixel-util8.asm
> --- a/source/common/x86/pixel-util8.asm	Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/common/x86/pixel-util8.asm	Tue Nov 18 14:00:27 2014 +0530
> @@ -420,7 +420,7 @@
>  
>  
>  ;-----------------------------------------------------------------------------
> -; uint32_t quant(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
> +; uint32_t quant(int16_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse4
>  cglobal quant, 5,6,8
> @@ -442,7 +442,7 @@
>      pxor        m7, m7          ; m7 = numZero
>  .loop:
>      ; 4 coeff
> -    movu        m0, [r0]        ; m0 = level
> +    pmovsxwd    m0, [r0]        ; m0 = level
>      pabsd       m1, m0
>      pmulld      m1, [r1]        ; m0 = tmpLevel1
>      paddd       m2, m1, m5
> @@ -460,7 +460,7 @@
>      movh        [r3], m3
>  
>      ; 4 coeff
> -    movu        m0, [r0 + 16]   ; m0 = level
> +    pmovsxwd    m0, [r0 + 8]    ; m0 = level
>      pabsd       m1, m0
>      pmulld      m1, [r1 + 16]   ; m0 = tmpLevel1
>      paddd       m2, m1, m5
> @@ -475,7 +475,7 @@
>      packssdw    m3, m3
>      movh        [r3 + 8], m3
>  
> -    add         r0, 32
> +    add         r0, 16
>      add         r1, 32
>      add         r2, 32
>      add         r3, 16
> @@ -512,7 +512,7 @@
>      pxor            m7, m7              ; m7 = numZero
>  .loop:
>      ; 8 coeff
> -    movu            m0, [r0]            ; m0 = level
> +    pmovsxwd        m0, [r0]            ; m0 = level
>      pabsd           m1, m0
>      pmulld          m1, [r1]            ; m0 = tmpLevel1
>      paddd           m2, m1, m5
> @@ -525,7 +525,7 @@
>      psignd          m2, m0
>  
>      ; 8 coeff
> -    movu            m0, [r0 + mmsize]   ; m0 = level
> +    pmovsxwd        m0, [r0 + mmsize/2] ; m0 = level
>      pabsd           m1, m0
>      pmulld          m1, [r1 + mmsize]   ; m0 = tmpLevel1
>      paddd           m3, m1, m5
> @@ -546,7 +546,7 @@
>      pminuw          m2, m9
>      paddw           m7, m2
>  
> -    add             r0, mmsize*2
> +    add             r0, mmsize
>      add             r1, mmsize*2
>      add             r2, mmsize*2
>      add             r3, mmsize
> @@ -584,7 +584,7 @@
>      pxor            m7, m7          ; m7 = numZero
>  .loop:
>      ; 8 coeff
> -    movu            m0, [r0]        ; m0 = level
> +    pmovsxwd        m0, [r0]        ; m0 = level
>      pabsd           m1, m0
>      pmulld          m1, [r1]        ; m0 = tmpLevel1
>      paddd           m2, m1, m5
> @@ -603,7 +603,7 @@
>      movu            [r3], xm3
>  
>      ; 8 coeff
> -    movu            m0, [r0 + mmsize]        ; m0 = level
> +    pmovsxwd        m0, [r0 + mmsize/2]        ; m0 = level
>      pabsd           m1, m0
>      pmulld          m1, [r1 + mmsize]        ; m0 = tmpLevel1
>      paddd           m2, m1, m5
> @@ -621,7 +621,7 @@
>      vpermq          m3, m3, q0020
>      movu            [r3 + mmsize/2], xm3
>  
> -    add             r0, mmsize*2
> +    add             r0, mmsize
>      add             r1, mmsize*2
>      add             r2, mmsize*2
>      add             r3, mmsize
> @@ -642,7 +642,7 @@
>  
>  
>  ;-----------------------------------------------------------------------------
> -; uint32_t nquant(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
> +; uint32_t nquant(int16_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse4
>  cglobal nquant, 3,5,8
> @@ -655,8 +655,8 @@
>      shr         r4d, 3
>  
>  .loop:
> -    movu        m0, [r0]        ; m0 = level
> -    movu        m1, [r0 + 16]   ; m1 = level
> +    pmovsxwd    m0, [r0]        ; m0 = level
> +    pmovsxwd    m1, [r0 + 8]    ; m1 = level
>  
>      pabsd       m2, m0
>      pmulld      m2, [r1]        ; m0 = tmpLevel1 * qcoeff
> @@ -673,7 +673,7 @@
>      packssdw    m2, m3
>  
>      movu        [r2], m2
> -    add         r0, 32
> +    add         r0, 16
>      add         r1, 32
>      add         r2, 16
>  
> @@ -703,14 +703,14 @@
>      shr         r4d, 4
>  
>  .loop:
> -    movu        m0, [r0]            ; m0 = level
> +    pmovsxwd    m0, [r0]            ; m0 = level
>      pabsd       m1, m0
>      pmulld      m1, [r1]            ; m0 = tmpLevel1 * qcoeff
>      paddd       m1, m4
>      psrad       m1, xm3             ; m0 = level1
>      psignd      m1, m0
>  
> -    movu        m0, [r0 + mmsize]   ; m0 = level
> +    pmovsxwd    m0, [r0 + mmsize/2] ; m0 = level
>      pabsd       m2, m0
>      pmulld      m2, [r1 + mmsize]   ; m0 = tmpLevel1 * qcoeff
>      paddd       m2, m4
> @@ -721,7 +721,7 @@
>      vpermq      m2, m1, q3120
>  
>      movu        [r2], m2
> -    add         r0, mmsize * 2
> +    add         r0, mmsize
>      add         r1, mmsize * 2
>      add         r2, mmsize
>  
> @@ -770,15 +770,11 @@
>      pmaddwd     m4, m1
>      psrad       m3, m0
>      psrad       m4, m0
> -    packssdw    m3, m3              ; OPT_ME: store must be 32 bits
> -    pmovsxwd    m3, m3
> -    packssdw    m4, m4
> -    pmovsxwd    m4, m4
> +    packssdw    m3, m4
>      mova        [r1], m3
> -    mova        [r1 + 16], m4
>  
>      add         r0, 16
> -    add         r1, 32
> +    add         r1, 16
>  
>      sub         r2d, 8
>      jnz        .loop
> @@ -818,13 +814,12 @@
>      pmaxsd          m3, m6
>      pminsd          m4, m5
>      pmaxsd          m4, m6
> +    packssdw        m3, m4
>      mova            [r1 + 0 * mmsize/2], xm3
> -    mova            [r1 + 1 * mmsize/2], xm4
> -    vextracti128    [r1 + 2 * mmsize/2], m3, 1
> -    vextracti128    [r1 + 3 * mmsize/2], m4, 1
> +    vextracti128    [r1 + 1 * mmsize/2], m3, 1
>  
>      add             r0, mmsize
> -    add             r1, mmsize * 2
> +    add             r1, mmsize
>  
>      dec             r2d
>      jnz            .loop
> diff -r 2f0062f0791b -r 706fa4af912b source/test/mbdstharness.cpp
> --- a/source/test/mbdstharness.cpp	Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/test/mbdstharness.cpp	Tue Nov 18 14:00:27 2014 +0530
> @@ -65,17 +65,17 @@
>          short_test_buff[0][i]    = (rand() & PIXEL_MAX) - (rand() & PIXEL_MAX);
>          int_test_buff[0][i]      = rand() % PIXEL_MAX;
>          int_idct_test_buff[0][i] = (rand() % (SHORT_MAX - SHORT_MIN)) - SHORT_MAX;
> -        int_denoise_test_buff1[0][i] = int_denoise_test_buff2[0][i] = (rand() & UNSIGNED_SHORT_MAX) - (rand() & UNSIGNED_SHORT_MAX);
> +        short_denoise_test_buff1[0][i] = short_denoise_test_buff2[0][i] = (rand() & SHORT_MAX) - (rand() & SHORT_MAX);
>  
>          short_test_buff[1][i]    = -PIXEL_MAX;
>          int_test_buff[1][i]      = -PIXEL_MAX;
>          int_idct_test_buff[1][i] = SHORT_MIN;
> -        int_denoise_test_buff1[1][i] = int_denoise_test_buff2[1][i] = -UNSIGNED_SHORT_MAX;
> +        short_denoise_test_buff1[1][i] = short_denoise_test_buff2[1][i] = -SHORT_MAX;
>  
>          short_test_buff[2][i]    = PIXEL_MAX;
>          int_test_buff[2][i]      = PIXEL_MAX;
>          int_idct_test_buff[2][i] = SHORT_MAX;
> -        int_denoise_test_buff1[2][i] = int_denoise_test_buff2[2][i] = UNSIGNED_SHORT_MAX;
> +        short_denoise_test_buff1[2][i] = short_denoise_test_buff2[2][i] = SHORT_MAX;
>  
>          mbuf1[i] = rand() & PIXEL_MAX;
>          mbufdct[i] = (rand() & PIXEL_MAX) - (rand() & PIXEL_MAX);
> @@ -96,16 +96,16 @@
>  bool MBDstHarness::check_dct_primitive(dct_t ref, dct_t opt, intptr_t width)
>  {
>      int j = 0;
> -    intptr_t cmp_size = sizeof(int) * width * width;
> +    intptr_t cmp_size = sizeof(short) * width * width;
>  
>      for (int i = 0; i < ITERS; i++)
>      {
>          int index = rand() % TEST_CASES;
>  
> -        ref(short_test_buff[index] + j, mintbuf3, width);
> -        checked(opt, short_test_buff[index] + j, mintbuf4, width);
> +        ref(short_test_buff[index] + j, mshortbuf2, width);
> +        checked(opt, short_test_buff[index] + j, mshortbuf3, width);
>  
> -        if (memcmp(mintbuf3, mintbuf4, cmp_size))
> +        if (memcmp(mshortbuf2, mshortbuf3, cmp_size))
>              return false;
>  
>          reportfail();
> @@ -124,8 +124,8 @@
>      {
>          int index = rand() % TEST_CASES;
>  
> -        ref(int_idct_test_buff[index] + j, mshortbuf2, width);
> -        checked(opt, int_idct_test_buff[index] + j, mshortbuf3, width);
> +        ref(short_test_buff[index] + j, mshortbuf2, width);
> +        checked(opt, short_test_buff[index] + j, mshortbuf3, width);
>  
>          if (memcmp(mshortbuf2, mshortbuf3, cmp_size))
>              return false;
> @@ -156,10 +156,10 @@
>          int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
>          int shift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift;
>  
> -        ref(short_test_buff[index] + j, mintbuf3, width * height, scale, shift);
> -        checked(opt, short_test_buff[index] + j, mintbuf4, width * height, scale, shift);
> +        ref(short_test_buff[index] + j, mshortbuf2, width * height, scale, shift);
> +        checked(opt, short_test_buff[index] + j, mshortbuf3, width * height, scale, shift);
>  
> -        if (memcmp(mintbuf3, mintbuf4, sizeof(int) * height * width))
> +        if (memcmp(mshortbuf2, mshortbuf3, sizeof(int16_t) * height * width))
>              return false;
>  
>          reportfail();
> @@ -175,6 +175,10 @@
>  
>      for (int i = 0; i < ITERS; i++)
>      {
> +
> +        memset(mshortbuf2, 0, MAX_TU_SIZE * sizeof(int16_t));
> +        memset(mshortbuf3, 0, MAX_TU_SIZE * sizeof(int16_t));
> +
>          int log2TrSize = (rand() % 4) + 2;
>  
>          int width = (1 << log2TrSize);
> @@ -185,13 +189,13 @@
>          int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
>          int shift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift;
>  
> -        int cmp_size = sizeof(int) * height * width;
> +        int cmp_size = sizeof(int16_t) * height * width;
>          int index1 = rand() % TEST_CASES;
>  
> -        ref(short_test_buff[index1] + j, mintbuf3, mintbuf1, width * height, per, shift);
> -        checked(opt, short_test_buff[index1] + j, mintbuf4, mintbuf2, width * height, per, shift);
> +        ref(short_test_buff[index1] + j, int_test_buff[index1] + j, mshortbuf2, width * height, per, shift);
> +        checked(opt, short_test_buff[index1] + j, int_test_buff[index1] + j, mshortbuf3, width * height, per, shift);
>  
> -        if (memcmp(mintbuf1, mintbuf2, cmp_size))
> +        if (memcmp(mshortbuf2, mshortbuf3, cmp_size))
>              return false;
>  
>          reportfail();
> @@ -222,8 +226,8 @@
>          int index1 = rand() % TEST_CASES;
>          int index2 = rand() % TEST_CASES;
>  
> -        refReturnValue = ref(int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf1, mshortbuf2, bits, valueToAdd, numCoeff);
> -        optReturnValue = (uint32_t)checked(opt, int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf3, mshortbuf3, bits, valueToAdd, numCoeff);
> +        refReturnValue = ref(short_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf1, mshortbuf2, bits, valueToAdd, numCoeff);
> +        optReturnValue = (uint32_t)checked(opt, short_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf3, mshortbuf3, bits, valueToAdd, numCoeff);
>  
>          if (memcmp(mintbuf1, mintbuf3, cmp_size))
>              return false;
> @@ -261,8 +265,8 @@
>          int index1 = rand() % TEST_CASES;
>          int index2 = rand() % TEST_CASES;
>  
> -        refReturnValue = ref(int_test_buff[index1] + j, int_test_buff[index2] + j, mshortbuf2, bits, valueToAdd, numCoeff);
> -        optReturnValue = (uint32_t)checked(opt, int_test_buff[index1] + j, int_test_buff[index2] + j, mshortbuf3, bits, valueToAdd, numCoeff);
> +        refReturnValue = ref(short_test_buff[index1] + j, int_test_buff[index2] + j, mshortbuf2, bits, valueToAdd, numCoeff);
> +        optReturnValue = (uint32_t)checked(opt, short_test_buff[index1] + j, int_test_buff[index2] + j, mshortbuf3, bits, valueToAdd, numCoeff);
>  
>          if (memcmp(mshortbuf2, mshortbuf3, cmp_size))
>              return false;
> @@ -324,6 +328,7 @@
>          int log2TrSize = s + 2;
>          int num = 1 << (log2TrSize * 2);
>          int cmp_size = sizeof(int) * num;
> +        int cmp_short = sizeof(short) * num;
>  
>          for (int i = 0; i < ITERS; i++)
>          {
> @@ -336,10 +341,10 @@
>  
>              int index = rand() % TEST_CASES;
>  
> -            ref(int_denoise_test_buff1[index] + j, mubuf1, mushortbuf1, num);
> -            checked(opt, int_denoise_test_buff2[index] + j, mubuf2, mushortbuf1, num);
> +            ref(short_denoise_test_buff1[index] + j, mubuf1, mushortbuf1, num);
> +            checked(opt, short_denoise_test_buff2[index] + j, mubuf2, mushortbuf1, num);
>  
> -            if (memcmp(int_denoise_test_buff1[index] + j, int_denoise_test_buff2[index] + j, cmp_size))
> +            if (memcmp(short_denoise_test_buff1[index] + j, short_denoise_test_buff2[index] + j, cmp_short))
>                  return false;
>  
>              if (memcmp(mubuf1, mubuf2, cmp_size))
> @@ -454,7 +459,7 @@
>          if (opt.dct[value])
>          {
>              printf("%s\t", dctInfo[value].name);
> -            REPORT_SPEEDUP(opt.dct[value], ref.dct[value], mbuf1, mintbuf3, dctInfo[value].width);
> +            REPORT_SPEEDUP(opt.dct[value], ref.dct[value], mbuf1, mshortbuf2, dctInfo[value].width);
>          }
>      }
>  
> @@ -463,32 +468,32 @@
>          if (opt.idct[value])
>          {
>              printf("%s\t", idctInfo[value].name);
> -            REPORT_SPEEDUP(opt.idct[value], ref.idct[value], mbufidct, mshortbuf2, idctInfo[value].width);
> +            REPORT_SPEEDUP(opt.idct[value], ref.idct[value], mshortbuf3, mshortbuf2, idctInfo[value].width);
>          }
>      }
>  
>      if (opt.dequant_normal)
>      {
>          printf("dequant_normal\t");
> -        REPORT_SPEEDUP(opt.dequant_normal, ref.dequant_normal, short_test_buff[0], mintbuf3, 32 * 32, 70, 1);
> +        REPORT_SPEEDUP(opt.dequant_normal, ref.dequant_normal, short_test_buff[0], mshortbuf2, 32 * 32, 70, 1);
>      }
>  
>      if (opt.dequant_scaling)
>      {
>          printf("dequant_scaling\t");
> -        REPORT_SPEEDUP(opt.dequant_scaling, ref.dequant_scaling, short_test_buff[0], mintbuf3, mintbuf4, 32 * 32, 5, 1);
> +        REPORT_SPEEDUP(opt.dequant_scaling, ref.dequant_scaling, short_test_buff[0], mintbuf3, mshortbuf2, 32 * 32, 5, 1);
>      }
>  
>      if (opt.quant)
>      {
>          printf("quant\t\t");
> -        REPORT_SPEEDUP(opt.quant, ref.quant, int_test_buff[0], int_test_buff[1], mintbuf3, mshortbuf2, 23, 23785, 32 * 32);
> +        REPORT_SPEEDUP(opt.quant, ref.quant, short_test_buff[0], int_test_buff[1], mintbuf3, mshortbuf2, 23, 23785, 32 * 32);
>      }
>  
>      if (opt.nquant)
>      {
>          printf("nquant\t\t");
> -        REPORT_SPEEDUP(opt.nquant, ref.nquant, int_test_buff[0], int_test_buff[1], mshortbuf2, 23, 23785, 32 * 32);
> +        REPORT_SPEEDUP(opt.nquant, ref.nquant, short_test_buff[0], int_test_buff[1], mshortbuf2, 23, 23785, 32 * 32);
>      }
>  
>      if (opt.count_nonzero)
> @@ -503,7 +508,7 @@
>      if (opt.denoiseDct)
>      {
>          printf("denoiseDct\t");
> -        REPORT_SPEEDUP(opt.denoiseDct, ref.denoiseDct, int_denoise_test_buff1[0], mubuf1, mushortbuf1, 32 * 32);
> +        REPORT_SPEEDUP(opt.denoiseDct, ref.denoiseDct, short_denoise_test_buff1[0], mubuf1, mushortbuf1, 32 * 32);
>      }
>  
>  }
> diff -r 2f0062f0791b -r 706fa4af912b source/test/mbdstharness.h
> --- a/source/test/mbdstharness.h	Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/test/mbdstharness.h	Tue Nov 18 14:00:27 2014 +0530
> @@ -60,8 +60,8 @@
>      uint32_t mubuf2[MAX_TU_SIZE];
>      uint16_t mushortbuf1[MAX_TU_SIZE];
>  
> -    int int_denoise_test_buff1[TEST_CASES][TEST_BUF_SIZE];
> -    int int_denoise_test_buff2[TEST_CASES][TEST_BUF_SIZE];
> +    int16_t short_denoise_test_buff1[TEST_CASES][TEST_BUF_SIZE];
> +    int16_t short_denoise_test_buff2[TEST_CASES][TEST_BUF_SIZE];
>  
>      bool check_dequant_primitive(dequant_scaling_t ref, dequant_scaling_t opt);
>      bool check_dequant_primitive(dequant_normal_t ref, dequant_normal_t opt);
> diff -r 2f0062f0791b -r 706fa4af912b source/test/pixelharness.cpp
> --- a/source/test/pixelharness.cpp	Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/test/pixelharness.cpp	Tue Nov 18 14:00:27 2014 +0530
> @@ -344,39 +344,11 @@
>      return true;
>  }
>  
> -bool PixelHarness::check_cvt32to16_shr_t(cvt32to16_shr_t ref, cvt32to16_shr_t opt)
> +bool PixelHarness::check_copy16to16_shl_t(cpy16to16_shl_t ref, cpy16to16_shl_t opt)
>  {
>      ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
>      ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
>  
> -    memset(ref_dest, 0xCD, sizeof(ref_dest));
> -    memset(opt_dest, 0xCD, sizeof(opt_dest));
> -
> -    int j = 0;
> -    intptr_t stride = STRIDE;
> -    for (int i = 0; i < ITERS; i++)
> -    {
> -        int shift = (rand() % 7 + 1);
> -
> -        int index = i % TEST_CASES;
> -        checked(opt, opt_dest, int_test_buff[index] + j, stride, shift, (int)STRIDE);
> -        ref(ref_dest, int_test_buff[index] + j, stride, shift, (int)STRIDE);
> -
> -        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
> -            return false;
> -
> -        reportfail();
> -        j += INCR;
> -    }
> -
> -    return true;
> -}
> -
> -bool PixelHarness::check_cvt16to32_shl_t(cvt16to32_shl_t ref, cvt16to32_shl_t opt)
> -{
> -    ALIGN_VAR_16(int32_t, ref_dest[64 * 64]);
> -    ALIGN_VAR_16(int32_t, opt_dest[64 * 64]);
> -
>      int j = 0;
>      intptr_t stride = STRIDE;
>      for (int i = 0; i < ITERS; i++)
> @@ -387,7 +359,7 @@
>          checked(opt, opt_dest, short_test_buff[index] + j, stride, shift, (int)stride);
>          ref(ref_dest, short_test_buff[index] + j, stride, shift, (int)stride);
>  
> -        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int32_t)))
> +        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
>              return false;
>  
>          reportfail();
> @@ -1337,20 +1309,11 @@
>  
>      }
>  
> -    if (opt.cvt32to16_shr)
> +    if (opt.cpy16to16_shl)
>      {
> -        if (!check_cvt32to16_shr_t(ref.cvt32to16_shr, opt.cvt32to16_shr))
> +        if (!check_copy16to16_shl_t(ref.cpy16to16_shl, opt.cpy16to16_shl))
>          {
> -            printf("cvt32to16 failed!\n");
> -            return false;
> -        }
> -    }
> -
> -    if (opt.cvt16to32_shl)
> -    {
> -        if (!check_cvt16to32_shl_t(ref.cvt16to32_shl, opt.cvt16to32_shl))
> -        {
> -            printf("cvt16to32_shl failed!\n");
> +            printf("copy16to16_shl failed!\n");
>              return false;
>          }
>      }
> @@ -1700,16 +1663,10 @@
>  
>      }
>  
> -    if (opt.cvt32to16_shr)
> +    if (opt.cpy16to16_shl)
>      {
> -        HEADER0("cvt32to16_shr");
> -        REPORT_SPEEDUP(opt.cvt32to16_shr, ref.cvt32to16_shr, sbuf1, ibuf1, 64, 5, 64);
> -    }
> -
> -    if (opt.cvt16to32_shl)
> -    {
> -        HEADER0("cvt16to32_shl");
> -        REPORT_SPEEDUP(opt.cvt16to32_shl, ref.cvt16to32_shl, ibuf1, sbuf1, 64, 5, 64);
> +        HEADER0("cpy16to16_shl");
> +        REPORT_SPEEDUP(opt.cpy16to16_shl, ref.cpy16to16_shl, sbuf2, sbuf1, 64, 5, 64);
>      }
>  
>      if (opt.weight_pp)
> diff -r 2f0062f0791b -r 706fa4af912b source/test/pixelharness.h
> --- a/source/test/pixelharness.h	Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/test/pixelharness.h	Tue Nov 18 14:00:27 2014 +0530
> @@ -80,8 +80,7 @@
>      bool check_weightp(weightp_pp_t ref, weightp_pp_t opt);
>      bool check_weightp(weightp_sp_t ref, weightp_sp_t opt);
>      bool check_downscale_t(downscale_t ref, downscale_t opt);
> -    bool check_cvt32to16_shr_t(cvt32to16_shr_t ref, cvt32to16_shr_t opt);
> -    bool check_cvt16to32_shl_t(cvt16to32_shl_t ref, cvt16to32_shl_t opt);
> +    bool check_copy16to16_shl_t(cpy16to16_shl_t ref, cpy16to16_shl_t opt);
>      bool check_cvt16to32_shr_t(cvt16to32_shr_t ref, cvt16to32_shr_t opt);
>      bool check_cvt32to16_shl_t(cvt32to16_shl_t ref, cvt32to16_shl_t opt);
>      bool check_copy_cnt_t(copy_cnt_t ref, copy_cnt_t opt);
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-- 
Steve Borho