[x265] primitives: refactor tskip related

Steve Borho steve at borho.org
Fri Nov 28 20:51:07 CET 2014


On 11/27, Satoshi Nakagawa wrote:
> # HG changeset patch
> # User Satoshi Nakagawa <nakagawa424 at oki.com>
> # Date 1417050723 -32400
> #      Thu Nov 27 10:12:03 2014 +0900
> # Node ID b4454aa1b6ab610c20241eb8fd5c73268b1ae3e0
> # Parent  dfe0803ae6be925281cd6101fc0354a34bedfefd
> primitives: refactor tskip related

Pushed as two patches - one for the nits and the other for the copy
primitive API changes.
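
For anyone following along, the new names encode the data layout rather
than the operand widths: cpy2Dto1D_* read a strided 2D block into a
contiguous 1D buffer, cpy1Dto2D_* write a contiguous buffer back out to
a strided block, and the _shl/_shr suffix selects a plain left shift or
a rounded right shift. Below is a condensed, standalone sketch of the C
reference behavior from the patch (alignment asserts omitted; this is
illustrative, not the exact tree code):

    #include <stdint.h>

    // 2D (strided src) -> 1D (contiguous dst) with left shift, as the
    // tskip forward path uses when transformShift >= 0.
    template<int size>
    void cpy2Dto1D_shl(int16_t* dst, const int16_t* src,
                       intptr_t srcStride, int shift)
    {
        for (int i = 0; i < size; i++)
        {
            for (int j = 0; j < size; j++)
                dst[j] = src[j] << shift;

            src += srcStride;
            dst += size;    // destination rows are packed size-by-size
        }
    }

    // 1D (contiguous src) -> 2D (strided dst) with rounded right shift,
    // as the tskip inverse path uses when transformShift > 0.
    template<int size>
    void cpy1Dto2D_shr(int16_t* dst, const int16_t* src,
                       intptr_t dstStride, int shift)
    {
        int16_t round = 1 << (shift - 1);
        for (int i = 0; i < size; i++)
        {
            for (int j = 0; j < size; j++)
                dst[j] = (src[j] + round) >> shift;

            src += size;
            dst += dstStride;
        }
    }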

> diff -r dfe0803ae6be -r b4454aa1b6ab source/common/dct.cpp
> --- a/source/common/dct.cpp	Wed Nov 26 16:56:00 2014 -0600
> +++ b/source/common/dct.cpp	Thu Nov 27 10:12:03 2014 +0900
> @@ -440,7 +440,7 @@
>      }
>  }
>  
> -void dst4_c(const int16_t *src, int16_t *dst, intptr_t stride)
> +void dst4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
>  {
>      const int shift_1st = 1 + X265_DEPTH - 8;
>      const int shift_2nd = 8;
> @@ -450,14 +450,14 @@
>  
>      for (int i = 0; i < 4; i++)
>      {
> -        memcpy(&block[i * 4], &src[i * stride], 4 * sizeof(int16_t));
> +        memcpy(&block[i * 4], &src[i * srcStride], 4 * sizeof(int16_t));
>      }
>  
>      fastForwardDst(block, coef, shift_1st);
>      fastForwardDst(coef, dst, shift_2nd);
>  }
>  
> -void dct4_c(const int16_t *src, int16_t *dst, intptr_t stride)
> +void dct4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
>  {
>      const int shift_1st = 1 + X265_DEPTH - 8;
>      const int shift_2nd = 8;
> @@ -467,14 +467,14 @@
>  
>      for (int i = 0; i < 4; i++)
>      {
> -        memcpy(&block[i * 4], &src[i * stride], 4 * sizeof(int16_t));
> +        memcpy(&block[i * 4], &src[i * srcStride], 4 * sizeof(int16_t));
>      }
>  
>      partialButterfly4(block, coef, shift_1st, 4);
>      partialButterfly4(coef, dst, shift_2nd, 4);
>  }
>  
> -void dct8_c(const int16_t *src, int16_t *dst, intptr_t stride)
> +void dct8_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
>  {
>      const int shift_1st = 2 + X265_DEPTH - 8;
>      const int shift_2nd = 9;
> @@ -484,14 +484,14 @@
>  
>      for (int i = 0; i < 8; i++)
>      {
> -        memcpy(&block[i * 8], &src[i * stride], 8 * sizeof(int16_t));
> +        memcpy(&block[i * 8], &src[i * srcStride], 8 * sizeof(int16_t));
>      }
>  
>      partialButterfly8(block, coef, shift_1st, 8);
>      partialButterfly8(coef, dst, shift_2nd, 8);
>  }
>  
> -void dct16_c(const int16_t *src, int16_t *dst, intptr_t stride)
> +void dct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
>  {
>      const int shift_1st = 3 + X265_DEPTH - 8;
>      const int shift_2nd = 10;
> @@ -501,14 +501,14 @@
>  
>      for (int i = 0; i < 16; i++)
>      {
> -        memcpy(&block[i * 16], &src[i * stride], 16 * sizeof(int16_t));
> +        memcpy(&block[i * 16], &src[i * srcStride], 16 * sizeof(int16_t));
>      }
>  
>      partialButterfly16(block, coef, shift_1st, 16);
>      partialButterfly16(coef, dst, shift_2nd, 16);
>  }
>  
> -void dct32_c(const int16_t *src, int16_t *dst, intptr_t stride)
> +void dct32_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
>  {
>      const int shift_1st = 4 + X265_DEPTH - 8;
>      const int shift_2nd = 11;
> @@ -518,14 +518,14 @@
>  
>      for (int i = 0; i < 32; i++)
>      {
> -        memcpy(&block[i * 32], &src[i * stride], 32 * sizeof(int16_t));
> +        memcpy(&block[i * 32], &src[i * srcStride], 32 * sizeof(int16_t));
>      }
>  
>      partialButterfly32(block, coef, shift_1st, 32);
>      partialButterfly32(coef, dst, shift_2nd, 32);
>  }
>  
> -void idst4_c(const int16_t *src, int16_t *dst, intptr_t stride)
> +void idst4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
>  {
>      const int shift_1st = 7;
>      const int shift_2nd = 12 - (X265_DEPTH - 8);
> @@ -538,11 +538,11 @@
>  
>      for (int i = 0; i < 4; i++)
>      {
> -        memcpy(&dst[i * stride], &block[i * 4], 4 * sizeof(int16_t));
> +        memcpy(&dst[i * dstStride], &block[i * 4], 4 * sizeof(int16_t));
>      }
>  }
>  
> -void idct4_c(const int16_t *src, int16_t *dst, intptr_t stride)
> +void idct4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
>  {
>      const int shift_1st = 7;
>      const int shift_2nd = 12 - (X265_DEPTH - 8);
> @@ -555,11 +555,11 @@
>  
>      for (int i = 0; i < 4; i++)
>      {
> -        memcpy(&dst[i * stride], &block[i * 4], 4 * sizeof(int16_t));
> +        memcpy(&dst[i * dstStride], &block[i * 4], 4 * sizeof(int16_t));
>      }
>  }
>  
> -void idct8_c(const int16_t *src, int16_t *dst, intptr_t stride)
> +void idct8_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
>  {
>      const int shift_1st = 7;
>      const int shift_2nd = 12 - (X265_DEPTH - 8);
> @@ -569,13 +569,14 @@
>  
>      partialButterflyInverse8(src, coef, shift_1st, 8);
>      partialButterflyInverse8(coef, block, shift_2nd, 8);
> +
>      for (int i = 0; i < 8; i++)
>      {
> -        memcpy(&dst[i * stride], &block[i * 8], 8 * sizeof(int16_t));
> +        memcpy(&dst[i * dstStride], &block[i * 8], 8 * sizeof(int16_t));
>      }
>  }
>  
> -void idct16_c(const int16_t *src, int16_t *dst, intptr_t stride)
> +void idct16_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
>  {
>      const int shift_1st = 7;
>      const int shift_2nd = 12 - (X265_DEPTH - 8);
> @@ -585,13 +586,14 @@
>  
>      partialButterflyInverse16(src, coef, shift_1st, 16);
>      partialButterflyInverse16(coef, block, shift_2nd, 16);
> +
>      for (int i = 0; i < 16; i++)
>      {
> -        memcpy(&dst[i * stride], &block[i * 16], 16 * sizeof(int16_t));
> +        memcpy(&dst[i * dstStride], &block[i * 16], 16 * sizeof(int16_t));
>      }
>  }
>  
> -void idct32_c(const int16_t *src, int16_t *dst, intptr_t stride)
> +void idct32_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
>  {
>      const int shift_1st = 7;
>      const int shift_2nd = 12 - (X265_DEPTH - 8);
> @@ -604,7 +606,7 @@
>  
>      for (int i = 0; i < 32; i++)
>      {
> -        memcpy(&dst[i * stride], &block[i * 32], 32 * sizeof(int16_t));
> +        memcpy(&dst[i * dstStride], &block[i * 32], 32 * sizeof(int16_t));
>      }
>  }
>  
> @@ -632,7 +634,7 @@
>      }
>  }
>  
> -void dequant_scaling_c(const int16_t* quantCoef, const int32_t *deQuantCoef, int16_t* coef, int num, int per, int shift)
> +void dequant_scaling_c(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift)
>  {
>      X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);
>  
> @@ -724,15 +726,15 @@
>  }
>  
>  template<int trSize>
> -uint32_t copy_count(int16_t* coeff, const int16_t* residual, intptr_t stride)
> +uint32_t copy_count(int16_t* coeff, const int16_t* residual, intptr_t resiStride)
>  {
>      uint32_t numSig = 0;
>      for (int k = 0; k < trSize; k++)
>      {
>          for (int j = 0; j < trSize; j++)
>          {
> -            coeff[k * trSize + j] = residual[k * stride + j];
> -            numSig += (residual[k * stride + j] != 0);
> +            coeff[k * trSize + j] = residual[k * resiStride + j];
> +            numSig += (residual[k * resiStride + j] != 0);
>          }
>      }
>  
> diff -r dfe0803ae6be -r b4454aa1b6ab source/common/pixel.cpp
> --- a/source/common/pixel.cpp	Wed Nov 26 16:56:00 2014 -0600
> +++ b/source/common/pixel.cpp	Thu Nov 27 10:12:03 2014 +0900
> @@ -32,32 +32,32 @@
>  
>  using namespace x265;
>  
> -#define SET_FUNC_PRIMITIVE_TABLE_C(FUNC_PREFIX, FUNC_PREFIX_DEF, FUNC_TYPE_CAST, DATA_TYPE1, DATA_TYPE2) \
> -    p.FUNC_PREFIX[LUMA_4x4]   = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4,  4, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_8x8]   = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8,  8, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_8x4]   = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8,  4, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_4x8]   = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4,  8, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_16x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 16, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_16x8]  = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16,  8, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_8x16]  = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 16, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_16x12] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 12, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_12x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<12, 16, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_16x4]  = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16,  4, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_4x16]  = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4, 16, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_32x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 32, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_32x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 16, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_16x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 32, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_32x24] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 24, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_24x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<24, 32, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_32x8]  = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32,  8, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_8x32]  = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 32, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_64x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 64, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_64x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 32, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_32x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 64, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_64x48] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 48, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_48x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<48, 64, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_64x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 16, DATA_TYPE1, DATA_TYPE2>; \
> -    p.FUNC_PREFIX[LUMA_16x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 64, DATA_TYPE1, DATA_TYPE2>;
> +#define SET_FUNC_PRIMITIVE_TABLE_C(FUNC_PREFIX, FUNC_PREFIX_DEF, DATA_TYPE1, DATA_TYPE2) \
> +    p.FUNC_PREFIX[LUMA_4x4]   = FUNC_PREFIX_DEF<4,  4, DATA_TYPE1, DATA_TYPE2>; \
> +    p.FUNC_PREFIX[LUMA_8x8]   = FUNC_PREFIX_DEF<8,  8, DATA_TYPE1, DATA_TYPE2>; \
> +    p.FUNC_PREFIX[LUMA_8x4]   = FUNC_PREFIX_DEF<8,  4, DATA_TYPE1, DATA_TYPE2>; \
> +    p.FUNC_PREFIX[LUMA_4x8]   = FUNC_PREFIX_DEF<4,  8, DATA_TYPE1, DATA_TYPE2>; \
> +    p.FUNC_PREFIX[LUMA_16x16] = FUNC_PREFIX_DEF<16, 16, DATA_TYPE1, DATA_TYPE2>; \
> +    p.FUNC_PREFIX[LUMA_16x8]  = FUNC_PREFIX_DEF<16,  8, DATA_TYPE1, DATA_TYPE2>; \
> +    p.FUNC_PREFIX[LUMA_8x16]  = FUNC_PREFIX_DEF<8, 16, DATA_TYPE1, DATA_TYPE2>; \
> +    p.FUNC_PREFIX[LUMA_16x12] = FUNC_PREFIX_DEF<16, 12, DATA_TYPE1, DATA_TYPE2>; \
> +    p.FUNC_PREFIX[LUMA_12x16] = FUNC_PREFIX_DEF<12, 16, DATA_TYPE1, DATA_TYPE2>; \
> +    p.FUNC_PREFIX[LUMA_16x4]  = FUNC_PREFIX_DEF<16,  4, DATA_TYPE1, DATA_TYPE2>; \
> +    p.FUNC_PREFIX[LUMA_4x16]  = FUNC_PREFIX_DEF<4, 16, DATA_TYPE1, DATA_TYPE2>; \
> +    p.FUNC_PREFIX[LUMA_32x32] = FUNC_PREFIX_DEF<32, 32, DATA_TYPE1, DATA_TYPE2>; \
> +    p.FUNC_PREFIX[LUMA_32x16] = FUNC_PREFIX_DEF<32, 16, DATA_TYPE1, DATA_TYPE2>; \
> +    p.FUNC_PREFIX[LUMA_16x32] = FUNC_PREFIX_DEF<16, 32, DATA_TYPE1, DATA_TYPE2>; \
> +    p.FUNC_PREFIX[LUMA_32x24] = FUNC_PREFIX_DEF<32, 24, DATA_TYPE1, DATA_TYPE2>; \
> +    p.FUNC_PREFIX[LUMA_24x32] = FUNC_PREFIX_DEF<24, 32, DATA_TYPE1, DATA_TYPE2>; \
> +    p.FUNC_PREFIX[LUMA_32x8]  = FUNC_PREFIX_DEF<32,  8, DATA_TYPE1, DATA_TYPE2>; \
> +    p.FUNC_PREFIX[LUMA_8x32]  = FUNC_PREFIX_DEF<8, 32, DATA_TYPE1, DATA_TYPE2>; \
> +    p.FUNC_PREFIX[LUMA_64x64] = FUNC_PREFIX_DEF<64, 64, DATA_TYPE1, DATA_TYPE2>; \
> +    p.FUNC_PREFIX[LUMA_64x32] = FUNC_PREFIX_DEF<64, 32, DATA_TYPE1, DATA_TYPE2>; \
> +    p.FUNC_PREFIX[LUMA_32x64] = FUNC_PREFIX_DEF<32, 64, DATA_TYPE1, DATA_TYPE2>; \
> +    p.FUNC_PREFIX[LUMA_64x48] = FUNC_PREFIX_DEF<64, 48, DATA_TYPE1, DATA_TYPE2>; \
> +    p.FUNC_PREFIX[LUMA_48x64] = FUNC_PREFIX_DEF<48, 64, DATA_TYPE1, DATA_TYPE2>; \
> +    p.FUNC_PREFIX[LUMA_64x16] = FUNC_PREFIX_DEF<64, 16, DATA_TYPE1, DATA_TYPE2>; \
> +    p.FUNC_PREFIX[LUMA_16x64] = FUNC_PREFIX_DEF<16, 64, DATA_TYPE1, DATA_TYPE2>;
>  
>  #define SET_FUNC_PRIMITIVE_TABLE_C2(FUNC_PREFIX) \
>      p.FUNC_PREFIX[LUMA_4x4]   = FUNC_PREFIX<4,  4>; \
> @@ -491,73 +491,73 @@
>      }
>  }
>  
> -void copy16to16_shl(int16_t *dst, const int16_t *src, intptr_t stride, int shift, int size)
> +template<int size>
> +void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
>  {
> -    X265_CHECK(!(size & 3), "invalid size\n");
> +    X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
> +    X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n");
> +    X265_CHECK(shift >= 0, "invalid shift\n");
> +
>      for (int i = 0; i < size; i++)
>      {
>          for (int j = 0; j < size; j++)
> -        {
> -            dst[i * size + j] = src[i * stride + j] << shift;
> -        }
> +            dst[j] = src[j] << shift;
> +
> +        src += srcStride;
> +        dst += size;
>      }
>  }
>  
>  template<int size>
> -void convert16to32_shr(int32_t* dst, const int16_t* src, intptr_t stride, int shift, int offset)
> +void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
>  {
> +    X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
> +    X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n");
> +    X265_CHECK(shift > 0, "invalid shift\n");
> +
> +    int16_t round = 1 << (shift - 1);
>      for (int i = 0; i < size; i++)
>      {
>          for (int j = 0; j < size; j++)
> -        {
> -            dst[i * size + j] = ((int)src[i * stride + j] + offset) >> shift;
> -        }
> +            dst[j] = (src[j] + round) >> shift;
> +
> +        src += srcStride;
> +        dst += size;
>      }
>  }
>  
> -void copy_shr(int16_t* dst, const int16_t* src, intptr_t stride, int shift, int size)
> +template<int size>
> +void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
>  {
> -    int round = 1 << (shift - 1);
> +    X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n");
> +    X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
> +    X265_CHECK(shift >= 0, "invalid shift\n");
>  
>      for (int i = 0; i < size; i++)
>      {
>          for (int j = 0; j < size; j++)
> -        {
> -            dst[j] = (int16_t)((src[j] + round) >> shift);
> -        }
> +            dst[j] = src[j] << shift;
>  
>          src += size;
> -        dst += stride;
> +        dst += dstStride;
>      }
>  }
>  
>  template<int size>
> -void convert32to16_shl(int16_t* dst, const int32_t* src, intptr_t stride, int shift)
> +void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
>  {
> +    X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n");
> +    X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
> +    X265_CHECK(shift > 0, "invalid shift\n");
> +
> +    int16_t round = 1 << (shift - 1);
>      for (int i = 0; i < size; i++)
>      {
>          for (int j = 0; j < size; j++)
> -        {
> -            dst[j] = ((int16_t)src[j] << shift);
> -        }
> +            dst[j] = (src[j] + round) >> shift;
>  
>          src += size;
> -        dst += stride;
> -    }
> -}
> -
> -template<int size>
> -void copy_shl(int16_t* dst, const int16_t* src, intptr_t stride, int shift)
> -{
> -    for (int i = 0; i < size; i++)
> -    {
> -        for (int j = 0; j < size; j++)
> -        {
> -            dst[j] = (src[j] << shift);
> -        }
> -
> -        src += size;
> -        dst += stride;
> +        dst += dstStride;
>      }
>  }
>  
> @@ -1263,9 +1263,9 @@
>      CHROMA_444(64, 16);
>      CHROMA_444(16, 64);
>  
> -    SET_FUNC_PRIMITIVE_TABLE_C(sse_pp, sse, pixelcmp_t, pixel, pixel)
> -    SET_FUNC_PRIMITIVE_TABLE_C(sse_sp, sse, pixelcmp_sp_t, int16_t, pixel)
> -    SET_FUNC_PRIMITIVE_TABLE_C(sse_ss, sse, pixelcmp_ss_t, int16_t, int16_t)
> +    SET_FUNC_PRIMITIVE_TABLE_C(sse_pp, sse, pixel, pixel)
> +    SET_FUNC_PRIMITIVE_TABLE_C(sse_sp, sse, int16_t, pixel)
> +    SET_FUNC_PRIMITIVE_TABLE_C(sse_ss, sse, int16_t, int16_t)
>  
>      p.blockfill_s[BLOCK_4x4]   = blockfil_s_c<4>;
>      p.blockfill_s[BLOCK_8x8]   = blockfil_s_c<8>;
> @@ -1273,21 +1273,22 @@
>      p.blockfill_s[BLOCK_32x32] = blockfil_s_c<32>;
>      p.blockfill_s[BLOCK_64x64] = blockfil_s_c<64>;
>  
> -    p.cpy16to16_shl = copy16to16_shl;
> -    p.cvt16to32_shr[BLOCK_4x4] = convert16to32_shr<4>;
> -    p.cvt16to32_shr[BLOCK_8x8] = convert16to32_shr<8>;
> -    p.cvt16to32_shr[BLOCK_16x16] = convert16to32_shr<16>;
> -    p.cvt16to32_shr[BLOCK_32x32] = convert16to32_shr<32>;
> -    p.cvt32to16_shl[BLOCK_4x4] = convert32to16_shl<4>;
> -    p.cvt32to16_shl[BLOCK_8x8] = convert32to16_shl<8>;
> -    p.cvt32to16_shl[BLOCK_16x16] = convert32to16_shl<16>;
> -    p.cvt32to16_shl[BLOCK_32x32] = convert32to16_shl<32>;
> -
> -    p.copy_shr = copy_shr;
> -    p.copy_shl[BLOCK_4x4] = copy_shl<4>;
> -    p.copy_shl[BLOCK_8x8] = copy_shl<8>;
> -    p.copy_shl[BLOCK_16x16] = copy_shl<16>;
> -    p.copy_shl[BLOCK_32x32] = copy_shl<32>;
> +    p.cpy2Dto1D_shl[BLOCK_4x4] = cpy2Dto1D_shl<4>;
> +    p.cpy2Dto1D_shl[BLOCK_8x8] = cpy2Dto1D_shl<8>;
> +    p.cpy2Dto1D_shl[BLOCK_16x16] = cpy2Dto1D_shl<16>;
> +    p.cpy2Dto1D_shl[BLOCK_32x32] = cpy2Dto1D_shl<32>;
> +    p.cpy2Dto1D_shr[BLOCK_4x4] = cpy2Dto1D_shr<4>;
> +    p.cpy2Dto1D_shr[BLOCK_8x8] = cpy2Dto1D_shr<8>;
> +    p.cpy2Dto1D_shr[BLOCK_16x16] = cpy2Dto1D_shr<16>;
> +    p.cpy2Dto1D_shr[BLOCK_32x32] = cpy2Dto1D_shr<32>;
> +    p.cpy1Dto2D_shl[BLOCK_4x4] = cpy1Dto2D_shl<4>;
> +    p.cpy1Dto2D_shl[BLOCK_8x8] = cpy1Dto2D_shl<8>;
> +    p.cpy1Dto2D_shl[BLOCK_16x16] = cpy1Dto2D_shl<16>;
> +    p.cpy1Dto2D_shl[BLOCK_32x32] = cpy1Dto2D_shl<32>;
> +    p.cpy1Dto2D_shr[BLOCK_4x4] = cpy1Dto2D_shr<4>;
> +    p.cpy1Dto2D_shr[BLOCK_8x8] = cpy1Dto2D_shr<8>;
> +    p.cpy1Dto2D_shr[BLOCK_16x16] = cpy1Dto2D_shr<16>;
> +    p.cpy1Dto2D_shr[BLOCK_32x32] = cpy1Dto2D_shr<32>;
>  
>      p.sa8d[BLOCK_4x4]   = satd_4x4;
>      p.sa8d[BLOCK_8x8]   = sa8d_8x8;
> diff -r dfe0803ae6be -r b4454aa1b6ab source/common/primitives.h
> --- a/source/common/primitives.h	Wed Nov 26 16:56:00 2014 -0600
> +++ b/source/common/primitives.h	Thu Nov 27 10:12:03 2014 +0900
> @@ -138,32 +138,27 @@
>  typedef int  (*pixel_ssd_s_t)(const int16_t* fenc, intptr_t fencstride);
>  typedef void (*pixelcmp_x4_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
>  typedef void (*pixelcmp_x3_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> -typedef void (*blockcpy_sp_t)(int bx, int by, int16_t* dst, intptr_t dstride, const pixel* src, intptr_t sstride); // dst is aligned
> -typedef void (*blockcpy_sc_t)(int bx, int by, int16_t* dst, intptr_t dstride, const uint8_t* src, intptr_t sstride); // dst is aligned
> -typedef void (*pixelsub_ps_t)(int bx, int by, int16_t* dst, intptr_t dstride, const pixel* src0, const pixel* src1, intptr_t sstride0, intptr_t sstride1);
>  typedef void (*pixelavg_pp_t)(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int weight);
>  typedef void (*blockfill_s_t)(int16_t* dst, intptr_t dstride, int16_t val);
>  
>  typedef void (*intra_pred_t)(pixel* dst, intptr_t dstStride, pixel* refLeft, pixel* refAbove, int dirMode, int bFilter);
>  typedef void (*intra_allangs_t)(pixel* dst, pixel* above0, pixel* left0, pixel* above1, pixel* left1, int bLuma);
>  
> -typedef void (*cpy16to16_shl_t)(int16_t* dst, const int16_t* src, intptr_t, int, int);
> -typedef void (*cvt16to32_shl_t)(int32_t* dst, const int16_t* src, intptr_t, int, int);
> -typedef void (*cvt16to32_shr_t)(int32_t* dst, const int16_t* src, intptr_t, int, int);
> -typedef void (*cvt32to16_shl_t)(int16_t* dst, const int32_t* src, intptr_t, int);
> -typedef uint32_t (*copy_cnt_t)(int16_t* coeff, const int16_t* residual, intptr_t stride);
> -typedef void (*copy_shr_t)(int16_t* dst, const int16_t* src, intptr_t stride, int shift, int size);
> -typedef void (*copy_shl_t)(int16_t* dst, const int16_t* src, intptr_t stride, int shift);
> +typedef void (*cpy2Dto1D_shl_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> +typedef void (*cpy2Dto1D_shr_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> +typedef void (*cpy1Dto2D_shl_t)(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
> +typedef void (*cpy1Dto2D_shr_t)(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
> +typedef uint32_t (*copy_cnt_t)(int16_t* coeff, const int16_t* residual, intptr_t resiStride);
>  
> -typedef void (*dct_t)(const int16_t* src, int16_t* dst, intptr_t stride);
> -typedef void (*idct_t)(const int16_t* src, int16_t* dst, intptr_t stride);
> +typedef void (*dct_t)(const int16_t* src, int16_t* dst, intptr_t srcStride);
> +typedef void (*idct_t)(const int16_t* src, int16_t* dst, intptr_t dstStride);
>  typedef void (*denoiseDct_t)(int16_t* dctCoef, uint32_t* resSum, const uint16_t* offset, int numCoeff);
>  
>  typedef void (*calcresidual_t)(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
>  typedef void (*transpose_t)(pixel* dst, const pixel* src, intptr_t stride);
> -typedef uint32_t (*quant_t)(const int16_t *coef, const int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
> -typedef uint32_t (*nquant_t)(const int16_t *coef, const int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
> -typedef void (*dequant_scaling_t)(const int16_t* src, const int32_t*vdequantCoef, int16_t* dst, int num, int mcqp_miper, int shift);
> +typedef uint32_t (*quant_t)(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
> +typedef uint32_t (*nquant_t)(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
> +typedef void (*dequant_scaling_t)(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift);
>  typedef void (*dequant_normal_t)(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
>  typedef int  (*count_nonzero_t)(const int16_t* quantCoeff, int numCoeff);
>  
> @@ -186,7 +181,7 @@
>  typedef void (*filter_hv_pp_t) (const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY);
>  typedef void (*filter_p2s_t)(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height);
>  
> -typedef void (*copy_pp_t)(pixel* dst, intptr_t dstride, const pixel* src, intptr_t sstride); // dst is aligned
> +typedef void (*copy_pp_t)(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); // dst is aligned
>  typedef void (*copy_sp_t)(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
>  typedef void (*copy_ps_t)(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>  typedef void (*copy_ss_t)(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
> @@ -195,7 +190,7 @@
>  typedef void (*pixel_add_ps_t)(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
>  typedef void (*addAvg_t)(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
>  
> -typedef void (*saoCuOrgE0_t)(pixel* rec, int8_t*  offsetEo, int width, int8_t signLeft);
> +typedef void (*saoCuOrgE0_t)(pixel* rec, int8_t* offsetEo, int width, int8_t signLeft);
>  typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
>  typedef void (*planecopy_sp_t) (const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
>  
> @@ -220,12 +215,11 @@
>      pixelcmp_ss_t   psy_cost_ss[NUM_SQUARE_BLOCKS];
>  
>      blockfill_s_t   blockfill_s[NUM_SQUARE_BLOCKS];  // block fill with value
> -    cpy16to16_shl_t cpy16to16_shl;
> -    cvt16to32_shr_t cvt16to32_shr[NUM_SQUARE_BLOCKS - 1];
> -    cvt32to16_shl_t cvt32to16_shl[NUM_SQUARE_BLOCKS - 1];
> +    cpy2Dto1D_shl_t cpy2Dto1D_shl[NUM_SQUARE_BLOCKS - 1];
> +    cpy2Dto1D_shr_t cpy2Dto1D_shr[NUM_SQUARE_BLOCKS - 1];
> +    cpy1Dto2D_shl_t cpy1Dto2D_shl[NUM_SQUARE_BLOCKS - 1];
> +    cpy1Dto2D_shr_t cpy1Dto2D_shr[NUM_SQUARE_BLOCKS - 1];
>      copy_cnt_t      copy_cnt[NUM_SQUARE_BLOCKS - 1];
> -    copy_shr_t      copy_shr;
> -    copy_shl_t      copy_shl[NUM_SQUARE_BLOCKS - 1];
>  
>      copy_pp_t       luma_copy_pp[NUM_LUMA_PARTITIONS];
>      copy_sp_t       luma_copy_sp[NUM_LUMA_PARTITIONS];
> diff -r dfe0803ae6be -r b4454aa1b6ab source/common/quant.cpp
> --- a/source/common/quant.cpp	Wed Nov 26 16:56:00 2014 -0600
> +++ b/source/common/quant.cpp	Thu Nov 27 10:12:03 2014 +0900
> @@ -322,49 +322,46 @@
>      return numSig;
>  }
>  
> -uint32_t Quant::transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t stride,
> +uint32_t Quant::transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t resiStride,
>                               coeff_t* coeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip)
>  {
> +    const uint32_t sizeIdx = log2TrSize - 2;
>      if (cu.m_tqBypass[absPartIdx])
>      {
>          X265_CHECK(log2TrSize >= 2 && log2TrSize <= 5, "Block size mistake!\n");
> -        return primitives.copy_cnt[log2TrSize - 2](coeff, residual, stride);
> +        return primitives.copy_cnt[sizeIdx](coeff, residual, resiStride);
>      }
>  
>      bool isLuma  = ttype == TEXT_LUMA;
>      bool usePsy  = m_psyRdoqScale && isLuma && !useTransformSkip;
>      int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; // Represents scaling through forward transform
> -    int trSize = 1 << log2TrSize;
>  
>      X265_CHECK((cu.m_slice->m_sps->quadtreeTULog2MaxSize >= log2TrSize), "transform size too large\n");
>      if (useTransformSkip)
>      {
>  #if X265_DEPTH <= 10
> -        primitives.cpy16to16_shl(m_resiDctCoeff, residual, stride, transformShift, trSize);
> +        X265_CHECK(transformShift >= 0, "invalid transformShift\n");
> +        primitives.cpy2Dto1D_shl[sizeIdx](m_resiDctCoeff, residual, resiStride, transformShift);
>  #else
>          if (transformShift >= 0)
> -            primitives.cvt16to32_shl(m_resiDctCoeff, residual, stride, transformShift, trSize);
> +            primitives.cpy2Dto1D_shl[sizeIdx](m_resiDctCoeff, residual, resiStride, transformShift);
>          else
> -        {
> -            int shift = -transformShift;
> -            int offset = (1 << (shift - 1));
> -            primitives.cvt16to32_shr[log2TrSize - 2](m_resiDctCoeff, residual, stride, shift, offset);
> -        }
> +            primitives.cpy2Dto1D_shr[sizeIdx](m_resiDctCoeff, residual, resiStride, -transformShift);
>  #endif
>      }
>      else
>      {
>          bool isIntra = cu.isIntra(absPartIdx);
> -        const uint32_t sizeIdx = log2TrSize - 2;
>          int useDST = !sizeIdx && isLuma && isIntra;
>          int index = DCT_4x4 + sizeIdx - useDST;
>  
> -        primitives.dct[index](residual, m_resiDctCoeff, stride);
> +        primitives.dct[index](residual, m_resiDctCoeff, resiStride);
>  
>          /* NOTE: if RDOQ is disabled globally, psy-rdoq is also disabled, so
>           * there is no risk of performing this DCT unnecessarily */
>          if (usePsy)
>          {
> +            int trSize = 1 << log2TrSize;
>              /* perform DCT on source pixels for psy-rdoq */
>              primitives.square_copy_ps[sizeIdx](m_fencShortBuf, trSize, fenc, fencStride);
>              primitives.dct[index](m_fencShortBuf, m_fencDctCoeff, trSize);
> @@ -408,12 +405,13 @@
>      }
>  }
>  
> -void Quant::invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t stride, const coeff_t* coeff,
> +void Quant::invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t resiStride, const coeff_t* coeff,
>                              uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig)
>  {
> +    const uint32_t sizeIdx = log2TrSize - 2;
>      if (transQuantBypass)
>      {
> -        primitives.copy_shl[log2TrSize - 2](residual, coeff, stride, 0);
> +        primitives.cpy1Dto2D_shl[sizeIdx](residual, coeff, resiStride, 0);
>          return;
>      }
>  
> @@ -427,7 +425,7 @@
>      if (m_scalingList->m_bEnabled)
>      {
>          int scalingListType = (bIntra ? 0 : 3) + ttype;
> -        const int32_t* dequantCoef = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem];
> +        const int32_t* dequantCoef = m_scalingList->m_dequantCoef[sizeIdx][scalingListType][rem];
>          primitives.dequant_scaling(coeff, dequantCoef, m_resiDctCoeff, numCoeff, per, shift);
>      }
>      else
> @@ -438,20 +436,18 @@
>  
>      if (useTransformSkip)
>      {
> -        int trSize = 1 << log2TrSize;
> -
>  #if X265_DEPTH <= 10
> -        primitives.copy_shr(residual, m_resiDctCoeff, stride, transformShift, trSize);
> +        X265_CHECK(transformShift > 0, "invalid transformShift\n");
> +        primitives.cpy1Dto2D_shr[sizeIdx](residual, m_resiDctCoeff, resiStride, transformShift);
>  #else
>          if (transformShift > 0)
> -            primitives.copy_shr(residual, m_resiDctCoeff, stride, transformShift, trSize);
> +            primitives.cpy1Dto2D_shr[sizeIdx](residual, m_resiDctCoeff, resiStride, transformShift);
>          else
> -            primitives.cvt32to16_shl[log2TrSize - 2](residual, m_resiDctCoeff, stride, -transformShift);
> +            primitives.cpy1Dto2D_shl[sizeIdx](residual, m_resiDctCoeff, resiStride, -transformShift);
>  #endif
>      }
>      else
>      {
> -        const uint32_t sizeIdx = log2TrSize - 2;
>          int useDST = !sizeIdx && ttype == TEXT_LUMA && bIntra;
>  
>          X265_CHECK((int)numSig == primitives.count_nonzero(coeff, 1 << (log2TrSize * 2)), "numSig differ\n");
> @@ -459,17 +455,17 @@
>          // DC only
>          if (numSig == 1 && coeff[0] != 0 && !useDST)
>          {
> -            const int shift_1st = 7;
> +            const int shift_1st = 7 - 6;
>              const int add_1st = 1 << (shift_1st - 1);
> -            const int shift_2nd = 12 - (X265_DEPTH - 8);
> +            const int shift_2nd = 12 - (X265_DEPTH - 8) - 3;
>              const int add_2nd = 1 << (shift_2nd - 1);
>  
> -            int dc_val = (((m_resiDctCoeff[0] * 64 + add_1st) >> shift_1st) * 64 + add_2nd) >> shift_2nd;
> -            primitives.blockfill_s[sizeIdx](residual, stride, (int16_t)dc_val);
> +            int dc_val = (((m_resiDctCoeff[0] * (64 >> 6) + add_1st) >> shift_1st) * (64 >> 3) + add_2nd) >> shift_2nd;
> +            primitives.blockfill_s[sizeIdx](residual, resiStride, (int16_t)dc_val);
>              return;
>          }
>  
> -        primitives.idct[IDCT_4x4 + sizeIdx - useDST](m_resiDctCoeff, residual, stride);
> +        primitives.idct[IDCT_4x4 + sizeIdx - useDST](m_resiDctCoeff, residual, resiStride);
>      }
>  }
>  
> diff -r dfe0803ae6be -r b4454aa1b6ab source/common/quant.h
> --- a/source/common/quant.h	Wed Nov 26 16:56:00 2014 -0600
> +++ b/source/common/quant.h	Thu Nov 27 10:12:03 2014 +0900
> @@ -104,10 +104,10 @@
>      /* CU setup */
>      void setQPforQuant(const CUData& ctu);
>  
> -    uint32_t transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencstride, const int16_t* residual, uint32_t stride, coeff_t* coeff,
> +    uint32_t transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t resiStride, coeff_t* coeff,
>                            uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip);
>  
> -    void invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t stride, const coeff_t* coeff,
> +    void invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t resiStride, const coeff_t* coeff,
>                           uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig);
>  
>      /* static methods shared with entropy.cpp */
> diff -r dfe0803ae6be -r b4454aa1b6ab source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp	Wed Nov 26 16:56:00 2014 -0600
> +++ b/source/common/x86/asm-primitives.cpp	Thu Nov 27 10:12:03 2014 +0900
> @@ -1336,10 +1336,22 @@
>          p.sad_x4[LUMA_16x4] = x265_pixel_sad_x4_16x4_sse2;
>          p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_mmx2;
>  
> -        p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_sse2;
> -        p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_sse2;
> -        p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_sse2;
> -        p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_sse2;
> +        p.cpy2Dto1D_shl[BLOCK_4x4] = x265_cpy2Dto1D_shl_4_sse2;
> +        p.cpy2Dto1D_shl[BLOCK_8x8] = x265_cpy2Dto1D_shl_8_sse2;
> +        p.cpy2Dto1D_shl[BLOCK_16x16] = x265_cpy2Dto1D_shl_16_sse2;
> +        p.cpy2Dto1D_shl[BLOCK_32x32] = x265_cpy2Dto1D_shl_32_sse2;
> +        p.cpy2Dto1D_shr[BLOCK_4x4] = x265_cpy2Dto1D_shr_4_sse2;
> +        p.cpy2Dto1D_shr[BLOCK_8x8] = x265_cpy2Dto1D_shr_8_sse2;
> +        p.cpy2Dto1D_shr[BLOCK_16x16] = x265_cpy2Dto1D_shr_16_sse2;
> +        p.cpy2Dto1D_shr[BLOCK_32x32] = x265_cpy2Dto1D_shr_32_sse2;
> +        p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_sse2;
> +        p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_sse2;
> +        p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_sse2;
> +        p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_sse2;
> +        p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_sse2;
> +        p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_sse2;
> +        p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_sse2;
> +        p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_sse2;
>  
>          CHROMA_PIXELSUB_PS(_sse2);
>          CHROMA_PIXELSUB_PS_422(_sse2);
> @@ -1406,10 +1418,6 @@
>          p.quant = x265_quant_sse4;
>          p.nquant = x265_nquant_sse4;
>          p.dequant_normal = x265_dequant_normal_sse4;
> -        p.cvt16to32_shr[BLOCK_4x4] = x265_cvt16to32_shr_4_sse4;
> -        p.cvt16to32_shr[BLOCK_8x8] = x265_cvt16to32_shr_8_sse4;
> -        p.cvt16to32_shr[BLOCK_16x16] = x265_cvt16to32_shr_16_sse4;
> -        p.cvt16to32_shr[BLOCK_32x32] = x265_cvt16to32_shr_32_sse4;
>          p.intra_pred[0][BLOCK_4x4] = x265_intra_pred_planar4_sse4;
>          p.intra_pred[0][BLOCK_8x8] = x265_intra_pred_planar8_sse4;
>          p.intra_pred[0][BLOCK_16x16] = x265_intra_pred_planar16_sse4;
> @@ -1438,6 +1446,14 @@
>          p.nquant = x265_nquant_avx2;
>          p.dequant_normal = x265_dequant_normal_avx2;
>          p.scale1D_128to64 = x265_scale1D_128to64_avx2;
> +        p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_avx2;
> +        p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_avx2;
> +        p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_avx2;
> +        p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_avx2;
> +        p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_avx2;
> +        p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_avx2;
> +        p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_avx2;
> +        p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_avx2;
>  #if X86_64
>          p.dct[DCT_8x8] = x265_dct8_avx2;
>          p.dct[DCT_16x16] = x265_dct16_avx2;
> @@ -1548,11 +1564,23 @@
>          p.frame_init_lowres_core = x265_frame_init_lowres_core_sse2;
>          SA8D_INTER_FROM_BLOCK(sse2);
>  
> -        p.cpy16to16_shl = x265_copy16to16_shl_sse2;
> -        p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_sse2;
> -        p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_sse2;
> -        p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_sse2;
> -        p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_sse2;
> +        p.cpy2Dto1D_shl[BLOCK_4x4] = x265_cpy2Dto1D_shl_4_sse2;
> +        p.cpy2Dto1D_shl[BLOCK_8x8] = x265_cpy2Dto1D_shl_8_sse2;
> +        p.cpy2Dto1D_shl[BLOCK_16x16] = x265_cpy2Dto1D_shl_16_sse2;
> +        p.cpy2Dto1D_shl[BLOCK_32x32] = x265_cpy2Dto1D_shl_32_sse2;
> +        p.cpy2Dto1D_shr[BLOCK_4x4] = x265_cpy2Dto1D_shr_4_sse2;
> +        p.cpy2Dto1D_shr[BLOCK_8x8] = x265_cpy2Dto1D_shr_8_sse2;
> +        p.cpy2Dto1D_shr[BLOCK_16x16] = x265_cpy2Dto1D_shr_16_sse2;
> +        p.cpy2Dto1D_shr[BLOCK_32x32] = x265_cpy2Dto1D_shr_32_sse2;
> +        p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_sse2;
> +        p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_sse2;
> +        p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_sse2;
> +        p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_sse2;
> +        p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_sse2;
> +        p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_sse2;
> +        p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_sse2;
> +        p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_sse2;
> +
>          p.calcresidual[BLOCK_4x4] = x265_getResidual4_sse2;
>          p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2;
>          p.transpose[BLOCK_4x4] = x265_transpose4_sse2;
> @@ -1568,10 +1596,6 @@
>          p.idct[IDST_4x4] = x265_idst4_sse2;
>  
>          p.planecopy_sp = x265_downShift_16_sse2;
> -        p.copy_shl[BLOCK_4x4] = x265_copy_shl_4_sse2;
> -        p.copy_shl[BLOCK_8x8] = x265_copy_shl_8_sse2;
> -        p.copy_shl[BLOCK_16x16] = x265_copy_shl_16_sse2;
> -        p.copy_shl[BLOCK_32x32] = x265_copy_shl_32_sse2;
>      }
>      if (cpuMask & X265_CPU_SSSE3)
>      {
> @@ -1615,10 +1639,6 @@
>          LUMA_ADDAVG(_sse4);
>          CHROMA_ADDAVG(_sse4);
>          CHROMA_ADDAVG_422(_sse4);
> -        p.cvt16to32_shr[BLOCK_4x4] = x265_cvt16to32_shr_4_sse4;
> -        p.cvt16to32_shr[BLOCK_8x8] = x265_cvt16to32_shr_8_sse4;
> -        p.cvt16to32_shr[BLOCK_16x16] = x265_cvt16to32_shr_16_sse4;
> -        p.cvt16to32_shr[BLOCK_32x32] = x265_cvt16to32_shr_32_sse4;
>  
>          // TODO: check POPCNT flag!
>          p.copy_cnt[BLOCK_4x4] = x265_copy_cnt_4_sse4;
> @@ -1688,7 +1708,6 @@
>          INTRA_ANG_SSE4(sse4);
>  
>          p.dct[DCT_8x8] = x265_dct8_sse4;
> -        p.copy_shr = x265_copy_shr_sse4;
>  //        p.denoiseDct = x265_denoise_dct_sse4;
>      }
>      if (cpuMask & X265_CPU_AVX)
> @@ -1759,10 +1778,14 @@
>          p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_avx2;
>          p.blockfill_s[BLOCK_32x32] = x265_blockfill_s_32x32_avx2;
>  
> -        p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_avx2;
> -        p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_avx2;
> -        p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_avx2;
> -        p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_avx2;
> +        p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_avx2;
> +        p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_avx2;
> +        p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_avx2;
> +        p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_avx2;
> +        p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_avx2;
> +        p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_avx2;
> +        p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_avx2;
> +        p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_avx2;
>  
>  //        p.denoiseDct = x265_denoise_dct_avx2;
>          p.dct[DCT_4x4] = x265_dct4_avx2;
> diff -r dfe0803ae6be -r b4454aa1b6ab source/common/x86/blockcopy8.asm
> --- a/source/common/x86/blockcopy8.asm	Wed Nov 26 16:56:00 2014 -0600
> +++ b/source/common/x86/blockcopy8.asm	Thu Nov 27 10:12:03 2014 +0900
> @@ -41,7 +41,7 @@
>  SECTION .text
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_2x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_2x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse2
>  cglobal blockcopy_pp_2x4, 4, 7, 0
> @@ -59,7 +59,7 @@
>  RET
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_2x8(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_2x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse2
>  cglobal blockcopy_pp_2x8, 4, 7, 0
> @@ -97,7 +97,7 @@
>      RET
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_2x16(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_2x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse2
>  cglobal blockcopy_pp_2x16, 4, 7, 0
> @@ -115,7 +115,7 @@
>  
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_4x2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_4x2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse2
>  cglobal blockcopy_pp_4x2, 4, 6, 0
> @@ -127,7 +127,7 @@
>      RET
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_4x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_4x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse2
>  cglobal blockcopy_pp_4x4, 4, 4, 4
> @@ -145,7 +145,7 @@
>      RET
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_PP_W4_H8 2
>  INIT_XMM sse2
> @@ -192,7 +192,7 @@
>  BLOCKCOPY_PP_W4_H8 4, 32
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_6x8(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_6x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse2
>  cglobal blockcopy_pp_6x8, 4, 7, 8
> @@ -257,7 +257,7 @@
>      RET
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_6x16(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_6x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse2
>  cglobal blockcopy_pp_6x16, 4, 7, 2
> @@ -279,7 +279,7 @@
>  
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_8x2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_8x2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse2
>  cglobal blockcopy_pp_8x2, 4, 4, 2
> @@ -291,7 +291,7 @@
>  RET
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_8x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_8x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse2
>  cglobal blockcopy_pp_8x4, 4, 4, 4
> @@ -309,7 +309,7 @@
>      RET
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_8x6(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_8x6(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse2
>  cglobal blockcopy_pp_8x6, 4, 7, 6
> @@ -333,7 +333,7 @@
>      RET
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_8x12(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_8x12(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse2
>  cglobal blockcopy_pp_8x12, 4, 5, 2
> @@ -350,7 +350,7 @@
>      RET
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_PP_W8_H8 2
>  INIT_XMM sse2
> @@ -397,7 +397,7 @@
>  BLOCKCOPY_PP_W8_H8 8, 64
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_PP_W12_H4 2
>  INIT_XMM sse2
> @@ -439,7 +439,7 @@
>  BLOCKCOPY_PP_W12_H4 12, 32
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_16x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_16x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_PP_W16_H4 2
>  INIT_XMM sse2
> @@ -471,7 +471,7 @@
>  BLOCKCOPY_PP_W16_H4 16, 12
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_PP_W16_H8 2
>  INIT_XMM sse2
> @@ -519,7 +519,7 @@
>  BLOCKCOPY_PP_W16_H8 16, 24
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_PP_W24_H4 2
>  INIT_XMM sse2
> @@ -560,7 +560,7 @@
>  BLOCKCOPY_PP_W24_H4 24, 64
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_PP_W32_H4 2
>  INIT_XMM sse2
> @@ -684,7 +684,7 @@
>      RET
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_32x24(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_32x24(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  INIT_YMM avx
>  cglobal blockcopy_pp_32x24, 4, 7, 6
> @@ -722,7 +722,7 @@
>      RET
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_PP_W32_H16_avx 2
>  INIT_YMM avx
> @@ -788,7 +788,7 @@
>  BLOCKCOPY_PP_W32_H16_avx 32, 64
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_PP_W48_H2 2
>  INIT_XMM sse2
> @@ -836,7 +836,7 @@
>  BLOCKCOPY_PP_W48_H2 48, 64
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_PP_W64_H4 2
>  INIT_XMM sse2
> @@ -897,7 +897,7 @@
>  BLOCKCOPY_PP_W64_H4 64, 64
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_2x4(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_2x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse4
>  cglobal blockcopy_sp_2x4, 4, 5, 2
> @@ -926,7 +926,7 @@
>  
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_2x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_2x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse4
>  cglobal blockcopy_sp_2x8, 4, 5, 2
> @@ -974,11 +974,11 @@
>  RET
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_SP_W2_H2 2
>  INIT_XMM sse2
> -cglobal blockcopy_sp_%1x%2, 4, 7, 2, dest, destStride, src, srcStride
> +cglobal blockcopy_sp_%1x%2, 4, 7, 2, dst, dstStride, src, srcStride
>      add         r3,     r3
>      mov         r6d,    %2/2
>  .loop:
> @@ -1003,10 +1003,10 @@
>  BLOCKCOPY_SP_W2_H2 2, 16
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_4x2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_4x2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse2
> -cglobal blockcopy_sp_4x2, 4, 4, 2, dest, destStride, src, srcStride
> +cglobal blockcopy_sp_4x2, 4, 4, 2, dst, dstStride, src, srcStride
>  
>  add        r3,        r3
>  
> @@ -1022,10 +1022,10 @@
>  RET
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_4x4(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_4x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse2
> -cglobal blockcopy_sp_4x4, 4, 4, 4, dest, destStride, src, srcStride
> +cglobal blockcopy_sp_4x4, 4, 4, 4, dst, dstStride, src, srcStride
>  
>  add        r3,     r3
>  
> @@ -1049,10 +1049,10 @@
>  RET
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_4x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_4x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse2
> -cglobal blockcopy_sp_4x8, 4, 4, 8, dest, destStride, src, srcStride
> +cglobal blockcopy_sp_4x8, 4, 4, 8, dst, dstStride, src, srcStride
>  
>  add        r3,      r3
>  
> @@ -1092,11 +1092,11 @@
>  RET
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_SP_W4_H8 2
>  INIT_XMM sse2
> -cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
> +cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
>  
>  mov         r4d,    %2/8
>  
> @@ -1150,7 +1150,7 @@
>  BLOCKCOPY_SP_W4_H8 4, 32
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_6x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_6x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse4
>  cglobal blockcopy_sp_6x8, 4, 4, 2
> @@ -1213,11 +1213,11 @@
>      RET
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_SP_W6_H2 2
>  INIT_XMM sse2
> -cglobal blockcopy_sp_%1x%2, 4, 7, 4, dest, destStride, src, srcStride
> +cglobal blockcopy_sp_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride
>      add         r3,     r3
>      mov         r6d,    %2/2
>  .loop:
> @@ -1247,10 +1247,10 @@
>  BLOCKCOPY_SP_W6_H2 6, 16
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_8x2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_8x2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse2
> -cglobal blockcopy_sp_8x2, 4, 4, 2, dest, destStride, src, srcStride
> +cglobal blockcopy_sp_8x2, 4, 4, 2, dst, dstStride, src, srcStride
>  
>  add        r3,         r3
>  
> @@ -1265,10 +1265,10 @@
>  RET
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_8x4(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_8x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse2
> -cglobal blockcopy_sp_8x4, 4, 4, 4, dest, destStride, src, srcStride
> +cglobal blockcopy_sp_8x4, 4, 4, 4, dst, dstStride, src, srcStride
>  
>  add        r3,     r3
>  
> @@ -1290,10 +1290,10 @@
>  RET
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_8x6(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_8x6(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse2
> -cglobal blockcopy_sp_8x6, 4, 4, 6, dest, destStride, src, srcStride
> +cglobal blockcopy_sp_8x6, 4, 4, 6, dst, dstStride, src, srcStride
>  
>  add        r3,      r3
>  
> @@ -1322,10 +1322,10 @@
>  RET
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_8x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_8x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse2
> -cglobal blockcopy_sp_8x8, 4, 4, 8, dest, destStride, src, srcStride
> +cglobal blockcopy_sp_8x8, 4, 4, 8, dst, dstStride, src, srcStride
>  
>  add        r3,      r3
>  
> @@ -1361,11 +1361,11 @@
>  RET
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_SP_W8_H4 2
>  INIT_XMM sse2
> -cglobal blockcopy_sp_%1x%2, 4, 5, 4, dest, destStride, src, srcStride
> +cglobal blockcopy_sp_%1x%2, 4, 5, 4, dst, dstStride, src, srcStride
>      add         r3,     r3
>      mov         r4d,    %2/4
>  .loop:
> @@ -1391,11 +1391,11 @@
>  BLOCKCOPY_SP_W8_H4 8, 12
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_SP_W8_H8 2
>  INIT_XMM sse2
> -cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
> +cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
>  
>  mov         r4d,    %2/8
>  
> @@ -1446,11 +1446,11 @@
>  BLOCKCOPY_SP_W8_H8 8, 64
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_SP_W12_H4 2
>  INIT_XMM sse2
> -cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
> +cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
>  
>  mov             r4d,     %2/4
>  
> @@ -1503,11 +1503,11 @@
>  BLOCKCOPY_SP_W12_H4 12, 32
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_SP_W16_H4 2
>  INIT_XMM sse2
> -cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
> +cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
>  
>  mov             r4d,     %2/4
>  
> @@ -1554,11 +1554,11 @@
>  BLOCKCOPY_SP_W16_H4 16, 24
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_SP_W24_H2 2
>  INIT_XMM sse2
> -cglobal blockcopy_sp_%1x%2, 4, 5, 6, dest, destStride, src, srcStride
> +cglobal blockcopy_sp_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride
>  
>  mov             r4d,     %2/2
>  
> @@ -1595,11 +1595,11 @@
>  BLOCKCOPY_SP_W24_H2 24, 64
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_SP_W32_H2 2
>  INIT_XMM sse2
> -cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
> +cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
>  
>  mov             r4d,     %2/2
>  
> @@ -1643,11 +1643,11 @@
>  BLOCKCOPY_SP_W32_H2 32, 48
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_SP_W48_H2 2
>  INIT_XMM sse2
> -cglobal blockcopy_sp_%1x%2, 4, 5, 6, dest, destStride, src, srcStride
> +cglobal blockcopy_sp_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride
>  
>  mov             r4d,     %2
>  
> @@ -1681,11 +1681,11 @@
>  BLOCKCOPY_SP_W48_H2 48, 64
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_SP_W64_H1 2
>  INIT_XMM sse2
> -cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
> +cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
>  
>  mov             r4d,       %2
>  
> @@ -1726,10 +1726,10 @@
>  BLOCKCOPY_SP_W64_H1 64, 64
>  
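For anyone mapping these back to C: blockcopy_sp narrows an int16_t block
back into pixels, and the strides are in elements, which is why the asm
doubles srcStride (add r3, r3) before using it as a byte offset. A minimal
sketch of what each WxH variant computes (the C fallback in blockcopy.cpp
is templated along these lines, give or take parameter names):

    template<int bx, int by>
    void blockcopy_sp(pixel* dst, intptr_t dstStride,
                      const int16_t* src, intptr_t srcStride)
    {
        for (int y = 0; y < by; y++)
        {
            for (int x = 0; x < bx; x++)
                dst[x] = (pixel)src[x];  // source assumed to fit pixel range
            dst += dstStride;
            src += srcStride;
        }
    }
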
>  ;-----------------------------------------------------------------------------
> -; void blockfill_s_4x4(int16_t *dest, intptr_t destride, int16_t val)
> +; void blockfill_s_4x4(int16_t* dst, intptr_t dstride, int16_t val)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse2
> -cglobal blockfill_s_4x4, 3, 3, 1, dest, destStride, val
> +cglobal blockfill_s_4x4, 3, 3, 1, dst, dstStride, val
>  
>  add        r1,            r1
>  
> @@ -1745,10 +1745,10 @@
>  RET
>  
>  ;-----------------------------------------------------------------------------
> -; void blockfill_s_8x8(int16_t *dest, intptr_t destride, int16_t val)
> +; void blockfill_s_8x8(int16_t* dst, intptr_t dstride, int16_t val)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse2
> -cglobal blockfill_s_8x8, 3, 3, 1, dest, destStride, val
> +cglobal blockfill_s_8x8, 3, 3, 1, dst, dstStride, val
>  
>  add        r1,            r1
>  
> @@ -1774,11 +1774,11 @@
>  RET
>  
>  ;-----------------------------------------------------------------------------
> -; void blockfill_s_%1x%2(int16_t *dest, intptr_t destride, int16_t val)
> +; void blockfill_s_%1x%2(int16_t* dst, intptr_t dstride, int16_t val)
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKFILL_S_W16_H8 2
>  INIT_XMM sse2
> -cglobal blockfill_s_%1x%2, 3, 5, 1, dest, destStride, val
> +cglobal blockfill_s_%1x%2, 3, 5, 1, dst, dstStride, val
>  
>  mov        r3d,           %2/8
>  
> @@ -1855,11 +1855,11 @@
>  RET
>  
>  ;-----------------------------------------------------------------------------
> -; void blockfill_s_%1x%2(int16_t *dest, intptr_t destride, int16_t val)
> +; void blockfill_s_%1x%2(int16_t* dst, intptr_t dstride, int16_t val)
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKFILL_S_W32_H4 2
>  INIT_XMM sse2
> -cglobal blockfill_s_%1x%2, 3, 5, 1, dest, destStride, val
> +cglobal blockfill_s_%1x%2, 3, 5, 1, dst, dstStride, val
>  
>  mov        r3d,           %2/4
>  
> @@ -1983,10 +1983,10 @@
>  RET
>  
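blockfill_s is simpler still: broadcast one int16_t value across the block
(dstStride is again in elements, hence the add r1, r1 above). Sketch:

    template<int bx, int by>
    void blockfill_s(int16_t* dst, intptr_t dstStride, int16_t val)
    {
        for (int y = 0; y < by; y++, dst += dstStride)
            for (int x = 0; x < bx; x++)
                dst[x] = val;
    }
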
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ps_2x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +; void blockcopy_ps_2x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse4
> -cglobal blockcopy_ps_2x4, 4, 4, 1, dest, destStride, src, srcStride
> +cglobal blockcopy_ps_2x4, 4, 4, 1, dst, dstStride, src, srcStride
>  
>  add        r1,            r1
>  
> @@ -2013,10 +2013,10 @@
>  
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ps_2x8(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +; void blockcopy_ps_2x8(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse4
> -cglobal blockcopy_ps_2x8, 4, 4, 1, dest, destStride, src, srcStride
> +cglobal blockcopy_ps_2x8, 4, 4, 1, dst, dstStride, src, srcStride
>  
>  add        r1,            r1
>  
> @@ -2065,10 +2065,10 @@
>  
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ps_2x16(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +; void blockcopy_ps_2x16(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse4
> -cglobal blockcopy_ps_2x16, 4, 5, 2, dest, destStride, src, srcStride
> +cglobal blockcopy_ps_2x16, 4, 5, 2, dst, dstStride, src, srcStride
>      add         r1,         r1
>      mov         r4d,        16/2
>  .loop:
> @@ -2086,10 +2086,10 @@
>  
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ps_4x2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +; void blockcopy_ps_4x2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse4
> -cglobal blockcopy_ps_4x2, 4, 4, 1, dest, destStride, src, srcStride
> +cglobal blockcopy_ps_4x2, 4, 4, 1, dst, dstStride, src, srcStride
>  
>  add        r1,         r1
>  
> @@ -2105,10 +2105,10 @@
>  
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ps_4x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +; void blockcopy_ps_4x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse4
> -cglobal blockcopy_ps_4x4, 4, 4, 1, dest, destStride, src, srcStride
> +cglobal blockcopy_ps_4x4, 4, 4, 1, dst, dstStride, src, srcStride
>  
>  add        r1,            r1
>  
> @@ -2135,11 +2135,11 @@
>  
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_PS_W4_H4 2
>  INIT_XMM sse4
> -cglobal blockcopy_ps_%1x%2, 4, 5, 1, dest, destStride, src, srcStride
> +cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride
>  
>  add     r1,      r1
>  mov    r4d,      %2/4
> @@ -2180,11 +2180,11 @@
>  
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_PS_W6_H4 2
>  INIT_XMM sse4
> -cglobal blockcopy_ps_%1x%2, 4, 5, 1, dest, destStride, src, srcStride
> +cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride
>  
>  add     r1,      r1
>  mov    r4d,      %2/4
> @@ -2227,10 +2227,10 @@
>  BLOCKCOPY_PS_W6_H4 6, 16
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ps_8x2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +; void blockcopy_ps_8x2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse4
> -cglobal blockcopy_ps_8x2, 4, 4, 1, dest, destStride, src, srcStride
> +cglobal blockcopy_ps_8x2, 4, 4, 1, dst, dstStride, src, srcStride
>  
>  add        r1,         r1
>  
> @@ -2245,10 +2245,10 @@
>  RET
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ps_8x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +; void blockcopy_ps_8x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse4
> -cglobal blockcopy_ps_8x4, 4, 4, 1, dest, destStride, src, srcStride
> +cglobal blockcopy_ps_8x4, 4, 4, 1, dst, dstStride, src, srcStride
>  
>  add        r1,            r1
>  
> @@ -2274,10 +2274,10 @@
>  RET
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ps_8x6(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +; void blockcopy_ps_8x6(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse4
> -cglobal blockcopy_ps_8x6, 4, 4, 1, dest, destStride, src, srcStride
> +cglobal blockcopy_ps_8x6, 4, 4, 1, dst, dstStride, src, srcStride
>  
>  add        r1,            r1
>  
> @@ -2314,11 +2314,11 @@
>  RET
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_PS_W8_H4 2
>  INIT_XMM sse4
> -cglobal blockcopy_ps_%1x%2, 4, 5, 1, dest, destStride, src, srcStride
> +cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride
>  
>  add     r1,      r1
>  mov    r4d,      %2/4
> @@ -2361,11 +2361,11 @@
>  
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_PS_W12_H2 2
>  INIT_XMM sse4
> -cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
> +cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
>  
>  add        r1,      r1
>  mov        r4d,     %2/2
> @@ -2398,10 +2398,10 @@
>  BLOCKCOPY_PS_W12_H2 12, 32
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ps_16x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +; void blockcopy_ps_16x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse4
> -cglobal blockcopy_ps_16x4, 4, 4, 3, dest, destStride, src, srcStride
> +cglobal blockcopy_ps_16x4, 4, 4, 3, dst, dstStride, src, srcStride
>  
>  add        r1,      r1
>  pxor       m0,      m0
> @@ -2436,11 +2436,11 @@
>  RET
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_PS_W16_H4 2
>  INIT_XMM sse4
> -cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
> +cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
>  
>  add        r1,      r1
>  mov        r4d,     %2/4
> @@ -2492,11 +2492,11 @@
>  BLOCKCOPY_PS_W16_H4 16, 24
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_PS_W24_H2 2
>  INIT_XMM sse4
> -cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
> +cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
>  
>  add        r1,      r1
>  mov        r4d,     %2/2
> @@ -2537,11 +2537,11 @@
>  BLOCKCOPY_PS_W24_H2 24, 64
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_PS_W32_H2 2
>  INIT_XMM sse4
> -cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
> +cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
>  
>  add        r1,      r1
>  mov        r4d,     %2/2
> @@ -2590,11 +2590,11 @@
>  BLOCKCOPY_PS_W32_H2 32, 48
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_PS_W48_H2 2
>  INIT_XMM sse4
> -cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
> +cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
>  
>  add        r1,      r1
>  mov        r4d,     %2/2
> @@ -2649,11 +2649,11 @@
>  BLOCKCOPY_PS_W48_H2 48, 64
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_PS_W64_H2 2
>  INIT_XMM sse4
> -cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
> +cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
>  
>  add        r1,      r1
>  mov        r4d,     %2/2
> @@ -2723,7 +2723,7 @@
>  BLOCKCOPY_PS_W64_H2 64, 64
>  
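blockcopy_ps is the widening counterpart (pixel in, int16_t out), and the
blockcopy_ss group further down is the plain short-to-short row copy; both
share the same loop shape as the sp sketch earlier:

    template<int bx, int by>
    void blockcopy_ps(int16_t* dst, intptr_t dstStride,
                      const pixel* src, intptr_t srcStride)
    {
        for (int y = 0; y < by; y++)
        {
            for (int x = 0; x < bx; x++)
                dst[x] = (int16_t)src[x];
            dst += dstStride;
            src += srcStride;
        }
    }
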
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_2x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_2x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse2
>  cglobal blockcopy_ss_2x4, 4, 6, 0
> @@ -2746,7 +2746,7 @@
>      RET
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_2x8(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_2x8(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse2
>  cglobal blockcopy_ss_2x8, 4, 6, 0
> @@ -2785,7 +2785,7 @@
>      RET
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_2x16(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_2x16(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse2
>  cglobal blockcopy_ss_2x16, 4, 7, 0
> @@ -2805,7 +2805,7 @@
>  
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_4x2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_4x2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse2
>  cglobal blockcopy_ss_4x2, 4, 4, 2
> @@ -2821,7 +2821,7 @@
>      RET
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_4x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_4x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse2
>  cglobal blockcopy_ss_4x4, 4, 4, 4
> @@ -2841,7 +2841,7 @@
>      RET
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_SS_W4_H8 2
>  INIT_XMM sse2
> @@ -2889,7 +2889,7 @@
>  BLOCKCOPY_SS_W4_H8 4, 32
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_6x8(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_6x8(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse2
>  cglobal blockcopy_ss_6x8, 4, 4, 4
> @@ -2944,7 +2944,7 @@
>      RET
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_6x16(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_6x16(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse2
>  cglobal blockcopy_ss_6x16, 4, 5, 4
> @@ -2968,7 +2968,7 @@
>  
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_8x2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_8x2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse2
>  cglobal blockcopy_ss_8x2, 4, 4, 2
> @@ -2984,7 +2984,7 @@
>      RET
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_8x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_8x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse2
>  cglobal blockcopy_ss_8x4, 4, 4, 4
> @@ -3005,7 +3005,7 @@
>      RET
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_8x6(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_8x6(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse2
>  cglobal blockcopy_ss_8x6, 4, 4, 4
> @@ -3034,7 +3034,7 @@
>      RET
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_8x12(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_8x12(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse2
>  cglobal blockcopy_ss_8x12, 4, 5, 2
> @@ -3054,7 +3054,7 @@
>  
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_SS_W8_H8 2
>  INIT_XMM sse2
> @@ -3105,7 +3105,7 @@
>  BLOCKCOPY_SS_W8_H8 8, 64
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_SS_W12_H4 2
>  INIT_XMM sse2
> @@ -3149,7 +3149,7 @@
>  BLOCKCOPY_SS_W12_H4 12, 32
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_16x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_16x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_SS_W16_H4 2
>  INIT_XMM sse2
> @@ -3192,7 +3192,7 @@
>  BLOCKCOPY_SS_W16_H4 16, 12
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_SS_W16_H4_avx 2
>  INIT_YMM avx
> @@ -3229,7 +3229,7 @@
>  BLOCKCOPY_SS_W16_H4_avx 16, 64
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_SS_W16_H8 2
>  INIT_XMM sse2
> @@ -3302,7 +3302,7 @@
>  BLOCKCOPY_SS_W16_H8 16, 24
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_SS_W24_H4 2
>  INIT_XMM sse2
> @@ -3354,7 +3354,7 @@
>  BLOCKCOPY_SS_W24_H4 24, 64
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_SS_W32_H4 2
>  INIT_XMM sse2
> @@ -3422,7 +3422,7 @@
>  BLOCKCOPY_SS_W32_H4 32, 48
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_SS_W48_H2 2
>  INIT_XMM sse2
> @@ -3500,11 +3500,11 @@
>  BLOCKCOPY_SS_W48_H2 48, 64
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_SS_W64_H4 2
>  INIT_XMM sse2
> -cglobal blockcopy_ss_%1x%2, 4, 5, 6, dest, deststride, src, srcstride
> +cglobal blockcopy_ss_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride
>      mov     r4d, %2/4
>      add     r1, r1
>      add     r3, r3
> @@ -3606,11 +3606,11 @@
>  BLOCKCOPY_SS_W64_H4 64, 64
>  
>  ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
>  ;-----------------------------------------------------------------------------
>  %macro BLOCKCOPY_SS_W64_H4_avx 2
>  INIT_YMM avx
> -cglobal blockcopy_ss_%1x%2, 4, 7, 4, dest, deststride, src, srcstride
> +cglobal blockcopy_ss_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride
>      mov     r4d, %2/4
>      add     r1, r1
>      add     r3, r3
> @@ -3670,152 +3670,82 @@
>  BLOCKCOPY_SS_W64_H4_avx 64, 64
>  
>  ;--------------------------------------------------------------------------------------
> -; void copy16to16_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size);
> +; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
>  ;--------------------------------------------------------------------------------------
>  INIT_XMM sse2
> -cglobal copy16to16_shl, 5, 6, 2, dst, src, stride, shift, size
> -%define shift       m1
> -
> -    ; make shift
> -    movd            shift,    r3d
> +cglobal cpy2Dto1D_shr_4, 3, 4, 4
> +    add             r2d, r2d
> +    movd            m0, r3m
> +    pcmpeqw         m1, m1
> +    psllw           m1, m0
> +    psraw           m1, 1
>  
>      ; register alloc
>      ; r0 - dst
>      ; r1 - src
> -    ; r2 - stride
> -    ; r4 - size
> -
> -    sub             r2d,      r4d
> -    add             r2d,      r2d
> -    mov             r5d,      r4d
> -    shr             r4d,      2
> -.loop_row:
> -    mov             r3d,      r4d
> -
> -.loop_col:
> -    movh            m0,       [r1]
> -    psllw           m0,       shift
> -    movh            [r0],     m0
> -
> -    add             r1,       8
> -    add             r0,       8
> -
> -    dec             r3d
> -    jnz             .loop_col
> -
> -    add             r1,       r2
> -    dec             r5d
> -    jnz             .loop_row
> +    ; r2 - srcStride
> +    ; m0 - shift
> +    ; m1 - word [-round]
> +
> +    ; Row 0-3
> +    movh            m2, [r1]
> +    movhps          m2, [r1 + r2]
> +    lea             r1, [r1 + r2 * 2]
> +    movh            m3, [r1]
> +    movhps          m3, [r1 + r2]
> +    psubw           m2, m1
> +    psubw           m3, m1
> +    psraw           m2, m0
> +    psraw           m3, m0
> +    mova            [r0 + 0 * mmsize], m2
> +    mova            [r0 + 1 * mmsize], m3
>      RET
>  
>  
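The pcmpeqw/psllw/psraw prologue here builds the word constant
-(1 << (shift - 1)) without a memory load, so the rounding offset is
applied with psubw before the arithmetic shift (the "m1 - word [-round]"
comment). In C terms the new cpy2Dto1D_shr primitive gathers a strided
block into a contiguous one with a rounded shift right, roughly (sketch;
assumes shift >= 1, size is the _N suffix on the asm names):

    template<int size>
    void cpy2Dto1D_shr(int16_t* dst, const int16_t* src,
                       intptr_t srcStride, int shift)
    {
        const int16_t round = (int16_t)(1 << (shift - 1));
        for (int i = 0; i < size; i++)
        {
            for (int j = 0; j < size; j++)
                dst[j] = (int16_t)((src[j] + round) >> shift);
            src += srcStride;
            dst += size;   // destination is a packed size*size buffer
        }
    }
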
>  ;--------------------------------------------------------------------------------------
> -; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
> +; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
>  ;--------------------------------------------------------------------------------------
> -INIT_XMM sse4
> -cglobal cvt16to32_shr_4, 3,3,3
> +INIT_XMM sse2
> +cglobal cpy2Dto1D_shr_8, 3, 5, 4
>      add             r2d, r2d
>      movd            m0, r3m
> -    movd            m1, r4m
> -    pshufd          m1, m1, 0
> +    pcmpeqw         m1, m1
> +    psllw           m1, m0
> +    psraw           m1, 1
> +    mov             r3d, 8/4
> +    lea             r4, [r2 * 3]
>  
>      ; register alloc
>      ; r0 - dst
>      ; r1 - src
> -    ; r2 - stride
> -    ; m0 - shift
> -    ; m1 - dword [offset]
> -
> -    ; Row 0
> -    pmovsxwd        m2, [r1]
> -    paddd           m2, m1
> -    psrad           m2, m0
> -    movu            [r0 + 0 * mmsize], m2
> -
> -    ; Row 1
> -    pmovsxwd        m2, [r1 + r2]
> -    paddd           m2, m1
> -    psrad           m2, m0
> -    movu            [r0 + 1 * mmsize], m2
> -
> -    ; Row 2
> -    lea             r1, [r1 + r2 * 2]
> -    pmovsxwd        m2, [r1]
> -    paddd           m2, m1
> -    psrad           m2, m0
> -    movu            [r0 + 2 * mmsize], m2
> -
> -    ; Row 3
> -    pmovsxwd        m2, [r1 + r2]
> -    paddd           m2, m1
> -    psrad           m2, m0
> -    movu            [r0 + 3 * mmsize], m2
> -    RET
> -
> -
> -;--------------------------------------------------------------------------------------
> -; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
> -;--------------------------------------------------------------------------------------
> -INIT_XMM sse4
> -cglobal cvt16to32_shr_8, 3,5,3
> -    add             r2d, r2d
> -    movd            m0, r3m
> -    movd            m1, r4m
> -    pshufd          m1, m1, 0
> -    mov             r3d, 8/4
> -    lea             r4, [r2 * 3]
> -
> -    ; register alloc
> -    ; r0 - dst
> -    ; r1 - src
> -    ; r2 - stride
> +    ; r2 - srcStride
>      ; r3 - loop counter
>      ; r4 - stride * 3
>      ; m0 - shift
> -    ; m1 - dword [offset]
> +    ; m1 - word [-round]
>  
>  .loop:
> -    ; Row 0
> -    pmovsxwd        m2, [r1]
> -    pmovsxwd        m3, [r1 + mmsize/2]
> -    paddd           m2, m1
> -    paddd           m3, m1
> -    psrad           m2, m0
> -    psrad           m3, m0
> -    movu            [r0 + 0 * mmsize], m2
> -    movu            [r0 + 1 * mmsize], m3
> -
> -    ; Row 1
> -    pmovsxwd        m2, [r1 + r2]
> -    pmovsxwd        m3, [r1 + r2 + mmsize/2]
> -    paddd           m2, m1
> -    paddd           m3, m1
> -    psrad           m2, m0
> -    psrad           m3, m0
> -    movu            [r0 + 2 * mmsize], m2
> -    movu            [r0 + 3 * mmsize], m3
> -
> -    ; Row 2
> -    pmovsxwd        m2, [r1 + r2 * 2]
> -    pmovsxwd        m3, [r1 + r2 * 2 + mmsize/2]
> -    paddd           m2, m1
> -    paddd           m3, m1
> -    psrad           m2, m0
> -    psrad           m3, m0
> -    movu            [r0 + 4 * mmsize], m2
> -    movu            [r0 + 5 * mmsize], m3
> -
> -    ; Row 3
> -    pmovsxwd        m2, [r1 + r4]
> -    pmovsxwd        m3, [r1 + r4 + mmsize/2]
> -    paddd           m2, m1
> -    paddd           m3, m1
> -    psrad           m2, m0
> -    psrad           m3, m0
> -    movu            [r0 + 6 * mmsize], m2
> -    movu            [r0 + 7 * mmsize], m3
> -
> -    add             r0, 8 * mmsize
> +    ; Row 0-1
> +    mova            m2, [r1]
> +    mova            m3, [r1 + r2]
> +    psubw           m2, m1
> +    psubw           m3, m1
> +    psraw           m2, m0
> +    psraw           m3, m0
> +    mova            [r0 + 0 * mmsize], m2
> +    mova            [r0 + 1 * mmsize], m3
> +
> +    ; Row 2-3
> +    mova            m2, [r1 + r2 * 2]
> +    mova            m3, [r1 + r4]
> +    psubw           m2, m1
> +    psubw           m3, m1
> +    psraw           m2, m0
> +    psraw           m3, m0
> +    mova            [r0 + 2 * mmsize], m2
> +    mova            [r0 + 3 * mmsize], m3
> +
> +    add             r0, 4 * mmsize
>      lea             r1, [r1 + r2 * 4]
>      dec             r3d
>      jnz            .loop
> @@ -3823,62 +3753,47 @@
>  
>  
>  ;--------------------------------------------------------------------------------------
> -; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
> +; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
>  ;--------------------------------------------------------------------------------------
> -INIT_XMM sse4
> -cglobal cvt16to32_shr_16, 3,4,6
> +INIT_XMM sse2
> +cglobal cpy2Dto1D_shr_16, 3, 4, 4
>      add             r2d, r2d
>      movd            m0, r3m
> -    movd            m1, r4m
> -    pshufd          m1, m1, 0
> +    pcmpeqw         m1, m1
> +    psllw           m1, m0
> +    psraw           m1, 1
>      mov             r3d, 16/2
>  
>      ; register alloc
>      ; r0 - dst
>      ; r1 - src
> -    ; r2 - stride
> +    ; r2 - srcStride
>      ; r3 - loop counter
>      ; m0 - shift
> -    ; m1 - dword [offset]
> +    ; m1 - word [-round]
>  
>  .loop:
>      ; Row 0
> -    pmovsxwd        m2, [r1 + 0 * mmsize/2]
> -    pmovsxwd        m3, [r1 + 1 * mmsize/2]
> -    pmovsxwd        m4, [r1 + 2 * mmsize/2]
> -    pmovsxwd        m5, [r1 + 3 * mmsize/2]
> -    paddd           m2, m1
> -    paddd           m3, m1
> -    paddd           m4, m1
> -    paddd           m5, m1
> -    psrad           m2, m0
> -    psrad           m3, m0
> -    psrad           m4, m0
> -    psrad           m5, m0
> -    movu            [r0 + 0 * mmsize], m2
> -    movu            [r0 + 1 * mmsize], m3
> -    movu            [r0 + 2 * mmsize], m4
> -    movu            [r0 + 3 * mmsize], m5
> +    mova            m2, [r1 + 0 * mmsize]
> +    mova            m3, [r1 + 1 * mmsize]
> +    psubw           m2, m1
> +    psubw           m3, m1
> +    psraw           m2, m0
> +    psraw           m3, m0
> +    mova            [r0 + 0 * mmsize], m2
> +    mova            [r0 + 1 * mmsize], m3
>  
>      ; Row 1
> -    pmovsxwd        m2, [r1 + r2 + 0 * mmsize/2]
> -    pmovsxwd        m3, [r1 + r2 +1 * mmsize/2]
> -    pmovsxwd        m4, [r1 + r2 +2 * mmsize/2]
> -    pmovsxwd        m5, [r1 + r2 +3 * mmsize/2]
> -    paddd           m2, m1
> -    paddd           m3, m1
> -    paddd           m4, m1
> -    paddd           m5, m1
> -    psrad           m2, m0
> -    psrad           m3, m0
> -    psrad           m4, m0
> -    psrad           m5, m0
> -    movu            [r0 + 4 * mmsize], m2
> -    movu            [r0 + 5 * mmsize], m3
> -    movu            [r0 + 6 * mmsize], m4
> -    movu            [r0 + 7 * mmsize], m5
> -
> -    add             r0, 8 * mmsize
> +    mova            m2, [r1 + r2 + 0 * mmsize]
> +    mova            m3, [r1 + r2 + 1 * mmsize]
> +    psubw           m2, m1
> +    psubw           m3, m1
> +    psraw           m2, m0
> +    psraw           m3, m0
> +    mova            [r0 + 2 * mmsize], m2
> +    mova            [r0 + 3 * mmsize], m3
> +
> +    add             r0, 4 * mmsize
>      lea             r1, [r1 + r2 * 2]
>      dec             r3d
>      jnz            .loop
> @@ -3886,61 +3801,45 @@
>  
>  
>  ;--------------------------------------------------------------------------------------
> -; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
> +; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
>  ;--------------------------------------------------------------------------------------
> -INIT_XMM sse4
> -cglobal cvt16to32_shr_32, 3,4,6
> +INIT_XMM sse2
> +cglobal cpy2Dto1D_shr_32, 3, 4, 6
>      add             r2d, r2d
>      movd            m0, r3m
> -    movd            m1, r4m
> -    pshufd          m1, m1, 0
> +    pcmpeqw         m1, m1
> +    psllw           m1, m0
> +    psraw           m1, 1
>      mov             r3d, 32/1
>  
>      ; register alloc
>      ; r0 - dst
>      ; r1 - src
> -    ; r2 - stride
> +    ; r2 - srcStride
>      ; r3 - loop counter
>      ; m0 - shift
> -    ; m1 - dword [offset]
> +    ; m1 - word [-round]
>  
>  .loop:
>      ; Row 0
> -    pmovsxwd        m2, [r1 + 0 * mmsize/2]
> -    pmovsxwd        m3, [r1 + 1 * mmsize/2]
> -    pmovsxwd        m4, [r1 + 2 * mmsize/2]
> -    pmovsxwd        m5, [r1 + 3 * mmsize/2]
> -    paddd           m2, m1
> -    paddd           m3, m1
> -    paddd           m4, m1
> -    paddd           m5, m1
> -    psrad           m2, m0
> -    psrad           m3, m0
> -    psrad           m4, m0
> -    psrad           m5, m0
> -    movu            [r0 + 0 * mmsize], m2
> -    movu            [r0 + 1 * mmsize], m3
> -    movu            [r0 + 2 * mmsize], m4
> -    movu            [r0 + 3 * mmsize], m5
> -
> -    pmovsxwd        m2, [r1 + 4 * mmsize/2]
> -    pmovsxwd        m3, [r1 + 5 * mmsize/2]
> -    pmovsxwd        m4, [r1 + 6 * mmsize/2]
> -    pmovsxwd        m5, [r1 + 7 * mmsize/2]
> -    paddd           m2, m1
> -    paddd           m3, m1
> -    paddd           m4, m1
> -    paddd           m5, m1
> -    psrad           m2, m0
> -    psrad           m3, m0
> -    psrad           m4, m0
> -    psrad           m5, m0
> -    movu            [r0 + 4 * mmsize], m2
> -    movu            [r0 + 5 * mmsize], m3
> -    movu            [r0 + 6 * mmsize], m4
> -    movu            [r0 + 7 * mmsize], m5
> -
> -    add             r0, 8 * mmsize
> +    mova            m2, [r1 + 0 * mmsize]
> +    mova            m3, [r1 + 1 * mmsize]
> +    mova            m4, [r1 + 2 * mmsize]
> +    mova            m5, [r1 + 3 * mmsize]
> +    psubw           m2, m1
> +    psubw           m3, m1
> +    psubw           m4, m1
> +    psubw           m5, m1
> +    psraw           m2, m0
> +    psraw           m3, m0
> +    psraw           m4, m0
> +    psraw           m5, m0
> +    mova            [r0 + 0 * mmsize], m2
> +    mova            [r0 + 1 * mmsize], m3
> +    mova            [r0 + 2 * mmsize], m4
> +    mova            [r0 + 3 * mmsize], m5
> +
> +    add             r0, 4 * mmsize
>      add             r1, r2
>      dec             r3d
>      jnz            .loop
> @@ -3948,58 +3847,239 @@
>  
>  
>  ;--------------------------------------------------------------------------------------
> -; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
> +; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
>  ;--------------------------------------------------------------------------------------
>  INIT_XMM sse2
> -cglobal cvt32to16_shl_4, 3,3,5
> +cglobal cpy1Dto2D_shl_4, 3, 3, 3
>      add         r2d, r2d
>      movd        m0, r3m
>  
>      ; Row 0-3
> +    mova        m1, [r1 + 0 * mmsize]
> +    mova        m2, [r1 + 1 * mmsize]
> +    psllw       m1, m0
> +    psllw       m2, m0
> +    movh        [r0], m1
> +    movhps      [r0 + r2], m1
> +    movh        [r0 + r2 * 2], m2
> +    lea         r2, [r2 * 3]
> +    movhps      [r0 + r2], m2
> +    RET
> +
> +
> +INIT_YMM avx2
> +cglobal cpy1Dto2D_shl_4, 3, 3, 2
> +    add         r2d, r2d
> +    movd        xm0, r3m
> +
> +    ; Row 0-3
> +    movu        m1, [r1]
> +    psllw       m1, xm0
> +    vextracti128 xm0, m1, 1
> +    movq        [r0], xm1
> +    movhps      [r0 + r2], xm1
> +    lea         r0, [r0 + r2 * 2]
> +    movq        [r0], xm0
> +    movhps      [r0 + r2], xm0
> +    RET
> +
> +
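cpy1Dto2D_shl goes the opposite direction: it scatters a packed
coefficient block back out to a strided destination, left-shifting each
sample with no rounding. Sketch under the same assumptions:

    template<int size>
    void cpy1Dto2D_shl(int16_t* dst, const int16_t* src,
                       intptr_t dstStride, int shift)
    {
        for (int i = 0; i < size; i++)
        {
            for (int j = 0; j < size; j++)
                dst[j] = (int16_t)(src[j] << shift);
            src += size;
            dst += dstStride;
        }
    }
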
> +;--------------------------------------------------------------------------------------
> +; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
> +;--------------------------------------------------------------------------------------
> +INIT_XMM sse2
> +cglobal cpy1Dto2D_shl_8, 3, 4, 5
> +    add         r2d, r2d
> +    movd        m0, r3m
> +    lea         r3, [r2 * 3]
> +
> +    ; Row 0-3
> +    mova        m1, [r1 + 0 * mmsize]
> +    mova        m2, [r1 + 1 * mmsize]
> +    mova        m3, [r1 + 2 * mmsize]
> +    mova        m4, [r1 + 3 * mmsize]
> +    psllw       m1, m0
> +    psllw       m2, m0
> +    psllw       m3, m0
> +    psllw       m4, m0
> +    mova        [r0], m1
> +    mova        [r0 + r2], m2
> +    mova        [r0 + r2 * 2], m3
> +    mova        [r0 + r3], m4
> +    lea         r0, [r0 + r2 * 4]
> +
> +    ; Row 4-7
> +    mova        m1, [r1 + 4 * mmsize]
> +    mova        m2, [r1 + 5 * mmsize]
> +    mova        m3, [r1 + 6 * mmsize]
> +    mova        m4, [r1 + 7 * mmsize]
> +    psllw       m1, m0
> +    psllw       m2, m0
> +    psllw       m3, m0
> +    psllw       m4, m0
> +    mova        [r0], m1
> +    mova        [r0 + r2], m2
> +    mova        [r0 + r2 * 2], m3
> +    mova        [r0 + r3], m4
> +    RET
> +
> +
> +INIT_YMM avx2
> +cglobal cpy1Dto2D_shl_8, 3, 4, 3
> +    add         r2d, r2d
> +    movd        xm0, r3m
> +    lea         r3, [r2 * 3]
> +
> +    ; Row 0-3
>      movu        m1, [r1 + 0 * mmsize]
>      movu        m2, [r1 + 1 * mmsize]
> -    movu        m3, [r1 + 2 * mmsize]
> -    movu        m4, [r1 + 3 * mmsize]
> -    packssdw    m1, m2
> -    packssdw    m3, m4
> +    psllw       m1, xm0
> +    psllw       m2, xm0
> +    movu        [r0], xm1
> +    vextracti128 [r0 + r2], m1, 1
> +    movu        [r0 + r2 * 2], xm2
> +    vextracti128 [r0 + r3], m2, 1
> +
> +    ; Row 4-7
> +    movu        m1, [r1 + 2 * mmsize]
> +    movu        m2, [r1 + 3 * mmsize]
> +    lea         r0, [r0 + r2 * 4]
> +    psllw       m1, xm0
> +    psllw       m2, xm0
> +    movu        [r0], xm1
> +    vextracti128 [r0 + r2], m1, 1
> +    movu        [r0 + r2 * 2], xm2
> +    vextracti128 [r0 + r3], m2, 1
> +    RET
> +
> +
> +;--------------------------------------------------------------------------------------
> +; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
> +;--------------------------------------------------------------------------------------
> +INIT_XMM sse2
> +cglobal cpy1Dto2D_shl_16, 3, 4, 5
> +    add         r2d, r2d
> +    movd        m0, r3m
> +    mov         r3d, 16/4
> +
> +.loop:
> +    ; Row 0-1
> +    mova        m1, [r1 + 0 * mmsize]
> +    mova        m2, [r1 + 1 * mmsize]
> +    mova        m3, [r1 + 2 * mmsize]
> +    mova        m4, [r1 + 3 * mmsize]
>      psllw       m1, m0
> +    psllw       m2, m0
>      psllw       m3, m0
> -    movh        [r0], m1
> -    movhps      [r0 + r2], m1
> -    movh        [r0 + r2 * 2], m3
> -    lea         r2, [r2 * 3]
> -    movhps      [r0 + r2], m3
> +    psllw       m4, m0
> +    mova        [r0], m1
> +    mova        [r0 + 16], m2
> +    mova        [r0 + r2], m3
> +    mova        [r0 + r2 + 16], m4
> +
> +    ; Row 2-3
> +    mova        m1, [r1 + 4 * mmsize]
> +    mova        m2, [r1 + 5 * mmsize]
> +    mova        m3, [r1 + 6 * mmsize]
> +    mova        m4, [r1 + 7 * mmsize]
> +    lea         r0, [r0 + r2 * 2]
> +    psllw       m1, m0
> +    psllw       m2, m0
> +    psllw       m3, m0
> +    psllw       m4, m0
> +    mova        [r0], m1
> +    mova        [r0 + 16], m2
> +    mova        [r0 + r2], m3
> +    mova        [r0 + r2 + 16], m4
> +
> +    add         r1, 8 * mmsize
> +    lea         r0, [r0 + r2 * 2]
> +    dec         r3d
> +    jnz        .loop
>      RET
>  
>  
>  INIT_YMM avx2
> -cglobal cvt32to16_shl_4, 3,3,3
> +cglobal cpy1Dto2D_shl_16, 3, 5, 3
>      add         r2d, r2d
>      movd        xm0, r3m
> -
> -    ; Row 0-3
> +    mov         r3d, 16/4
> +    lea         r4, [r2 * 3]
> +
> +.loop:
> +    ; Row 0-1
>      movu        m1, [r1 + 0 * mmsize]
>      movu        m2, [r1 + 1 * mmsize]
> -    packssdw    m1, m2
>      psllw       m1, xm0
> -    vextracti128 xm0, m1, 1
> -    movq        [r0], xm1
> -    movq        [r0 + r2], xm0
> -    lea         r0, [r0 + r2 * 2]
> -    movhps      [r0], xm1
> -    movhps      [r0 + r2], xm0
> +    psllw       m2, xm0
> +    movu        [r0], m1
> +    movu        [r0 + r2], m2
> +
> +    ; Row 2-3
> +    movu        m1, [r1 + 2 * mmsize]
> +    movu        m2, [r1 + 3 * mmsize]
> +    psllw       m1, xm0
> +    psllw       m2, xm0
> +    movu        [r0 + r2 * 2], m1
> +    movu        [r0 + r4], m2
> +
> +    add         r1, 4 * mmsize
> +    lea         r0, [r0 + r2 * 4]
> +    dec         r3d
> +    jnz        .loop
>      RET
>  
>  
>  ;--------------------------------------------------------------------------------------
> -; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
> +; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
>  ;--------------------------------------------------------------------------------------
>  INIT_XMM sse2
> -cglobal cvt32to16_shl_8, 3,5,5
> +cglobal cpy1Dto2D_shl_32, 3, 4, 5
>      add         r2d, r2d
>      movd        m0, r3m
> -    mov         r3d, 8/4
> -    lea         r4, [r2 * 3]
> +    mov         r3d, 32/2
> +
> +.loop:
> +    ; Row 0
> +    mova        m1, [r1 + 0 * mmsize]
> +    mova        m2, [r1 + 1 * mmsize]
> +    mova        m3, [r1 + 2 * mmsize]
> +    mova        m4, [r1 + 3 * mmsize]
> +    psllw       m1, m0
> +    psllw       m2, m0
> +    psllw       m3, m0
> +    psllw       m4, m0
> +    mova        [r0 + 0 * mmsize], m1
> +    mova        [r0 + 1 * mmsize], m2
> +    mova        [r0 + 2 * mmsize], m3
> +    mova        [r0 + 3 * mmsize], m4
> +
> +    ; Row 1
> +    mova        m1, [r1 + 4 * mmsize]
> +    mova        m2, [r1 + 5 * mmsize]
> +    mova        m3, [r1 + 6 * mmsize]
> +    mova        m4, [r1 + 7 * mmsize]
> +    psllw       m1, m0
> +    psllw       m2, m0
> +    psllw       m3, m0
> +    psllw       m4, m0
> +    mova        [r0 + r2 + 0 * mmsize], m1
> +    mova        [r0 + r2 + 1 * mmsize], m2
> +    mova        [r0 + r2 + 2 * mmsize], m3
> +    mova        [r0 + r2 + 3 * mmsize], m4
> +
> +    add         r1, 8 * mmsize
> +    lea         r0, [r0 + r2 * 2]
> +    dec         r3d
> +    jnz        .loop
> +    RET
> +
> +
> +INIT_YMM avx2
> +cglobal cpy1Dto2D_shl_32, 3, 4, 5
> +    add         r2d, r2d
> +    movd        xm0, r3m
> +    mov         r3d, 32/2
>  
>  .loop:
>      ; Row 0-1
> @@ -4007,252 +4087,14 @@
>      movu        m2, [r1 + 1 * mmsize]
>      movu        m3, [r1 + 2 * mmsize]
>      movu        m4, [r1 + 3 * mmsize]
> -    packssdw    m1, m2
> -    packssdw    m3, m4
> -    psllw       m1, m0
> -    psllw       m3, m0
> +    psllw       m1, xm0
> +    psllw       m2, xm0
> +    psllw       m3, xm0
> +    psllw       m4, xm0
>      movu        [r0], m1
> +    movu        [r0 + mmsize], m2
>      movu        [r0 + r2], m3
> -
> -    ; Row 2-3
> -    movu        m1, [r1 + 4 * mmsize]
> -    movu        m2, [r1 + 5 * mmsize]
> -    movu        m3, [r1 + 6 * mmsize]
> -    movu        m4, [r1 + 7 * mmsize]
> -    packssdw    m1, m2
> -    packssdw    m3, m4
> -    psllw       m1, m0
> -    psllw       m3, m0
> -    movu        [r0 + r2 * 2], m1
> -    movu        [r0 + r4], m3
> -
> -    add         r1, 8 * mmsize
> -    lea         r0, [r0 + r2 * 4]
> -    dec         r3d
> -    jnz        .loop
> -    RET
> -
> -
> -INIT_YMM avx2
> -cglobal cvt32to16_shl_8, 3,4,3
> -    add         r2d, r2d
> -    movd        xm0, r3m
> -    lea         r3, [r2 * 3]
> -
> -    ; Row 0-1
> -    movu        xm1, [r1 + 0 * mmsize]
> -    vinserti128  m1, m1, [r1 + 1 * mmsize], 1
> -    movu        xm2, [r1 + 0 * mmsize + mmsize/2]
> -    vinserti128  m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
> -    packssdw    m1, m2
> -    psllw       m1, xm0
> -    movu        [r0], xm1
> -    vextracti128 [r0 + r2], m1, 1
> -
> -    ; Row 2-3
> -    movu        xm1, [r1 + 2 * mmsize]
> -    vinserti128  m1, m1, [r1 + 3 * mmsize], 1
> -    movu        xm2, [r1 + 2 * mmsize + mmsize/2]
> -    vinserti128  m2, m2, [r1 + 3 * mmsize + mmsize/2], 1
> -    packssdw    m1, m2
> -    psllw       m1, xm0
> -    movu        [r0 + r2 * 2], xm1
> -    vextracti128 [r0 + r3], m1, 1
> -
> -    add         r1, 4 * mmsize
> -    lea         r0, [r0 + r2 * 4]
> -
> -    ; Row 4-5
> -    movu        m1, [r1 + 0 * mmsize]
> -    movu        m2, [r1 + 1 * mmsize]
> -    packssdw    m1, m2
> -    vpermq      m1, m1, 11011000b
> -    psllw       m1, xm0
> -    movu        [r0], xm1
> -    vextracti128 [r0 + r2], m1, 1
> -
> -    ; Row 6-7
> -    movu        m1, [r1 + 2 * mmsize]
> -    movu        m2, [r1 + 3 * mmsize]
> -    packssdw    m1, m2
> -    vpermq      m1, m1, 11011000b
> -    psllw       m1, xm0
> -    movu        [r0 + r2 * 2], xm1
> -    vextracti128 [r0 + r3], m1, 1
> -    RET
> -
> -;--------------------------------------------------------------------------------------
> -; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
> -;--------------------------------------------------------------------------------------
> -INIT_XMM sse2
> -cglobal cvt32to16_shl_16, 3,4,5
> -    add         r2d, r2d
> -    movd        m0, r3m
> -    mov         r3d, 16/2
> -
> -.loop:
> -    ; Row 0
> -    movu        m1, [r1 + 0 * mmsize]
> -    movu        m2, [r1 + 1 * mmsize]
> -    movu        m3, [r1 + 2 * mmsize]
> -    movu        m4, [r1 + 3 * mmsize]
> -    packssdw    m1, m2
> -    packssdw    m3, m4
> -    psllw       m1, m0
> -    psllw       m3, m0
> -    movu        [r0], m1
> -    movu        [r0 + mmsize], m3
> -
> -    ; Row 1
> -    movu        m1, [r1 + 4 * mmsize]
> -    movu        m2, [r1 + 5 * mmsize]
> -    movu        m3, [r1 + 6 * mmsize]
> -    movu        m4, [r1 + 7 * mmsize]
> -    packssdw    m1, m2
> -    packssdw    m3, m4
> -    psllw       m1, m0
> -    psllw       m3, m0
> -    movu        [r0 + r2], m1
> -    movu        [r0 + r2 + mmsize], m3
> -
> -    add         r1, 8 * mmsize
> -    lea         r0, [r0 + r2 * 2]
> -    dec         r3d
> -    jnz        .loop
> -    RET
> -
> -
> -INIT_YMM avx2
> -cglobal cvt32to16_shl_16, 3,5,3
> -    add         r2d, r2d
> -    movd        xm0, r3m
> -    mov         r3d, 16/4
> -    lea         r4, [r2 * 3]
> -
> -.loop:
> -    ; Row 0
> -    movu        xm1, [r1 + 0 * mmsize]
> -    vinserti128  m1, m1, [r1 + 1 * mmsize], 1
> -    movu        xm2, [r1 + 0 * mmsize + mmsize/2]
> -    vinserti128  m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
> -    packssdw    m1, m2
> -    psllw       m1, xm0
> -    movu        [r0], m1
> -
> -    ; Row 1
> -    movu        xm1, [r1 + 2 * mmsize]
> -    vinserti128  m1, m1, [r1 + 3 * mmsize], 1
> -    movu        xm2, [r1 + 2 * mmsize + mmsize/2]
> -    vinserti128  m2, m2, [r1 + 3 * mmsize + mmsize/2], 1
> -    packssdw    m1, m2
> -    psllw       m1, xm0
> -    movu        [r0 + r2], m1
> -
> -    add         r1, 4 * mmsize
> -
> -    ; Row 2
> -    movu        xm1, [r1 + 0 * mmsize]
> -    vinserti128  m1, m1, [r1 + 1 * mmsize], 1
> -    movu        xm2, [r1 + 0 * mmsize + mmsize/2]
> -    vinserti128  m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
> -    packssdw    m1, m2
> -    psllw       m1, xm0
> -    movu        [r0 + r2 * 2], m1
> -
> -    ; Row 3
> -    movu        m1, [r1 + 2 * mmsize]
> -    movu        m2, [r1 + 3 * mmsize]
> -    packssdw    m1, m2
> -    psllw       m1, xm0
> -    vpermq      m1, m1, 11011000b
> -    movu        [r0 + r4], m1
> -
> -    add         r1, 4 * mmsize
> -    lea         r0, [r0 + r2 * 4]
> -    dec         r3d
> -    jnz        .loop
> -    RET
> -
> -
> -;--------------------------------------------------------------------------------------
> -; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
> -;--------------------------------------------------------------------------------------
> -INIT_XMM sse2
> -cglobal cvt32to16_shl_32, 3,4,5
> -    add         r2d, r2d
> -    movd        m0, r3m
> -    mov         r3d, 32/1
> -
> -.loop:
> -    ; Row 0
> -    movu        m1, [r1 + 0 * mmsize]
> -    movu        m2, [r1 + 1 * mmsize]
> -    movu        m3, [r1 + 2 * mmsize]
> -    movu        m4, [r1 + 3 * mmsize]
> -    packssdw    m1, m2
> -    packssdw    m3, m4
> -    psllw       m1, m0
> -    psllw       m3, m0
> -    movu        [r0 + 0 * mmsize], m1
> -    movu        [r0 + 1 * mmsize], m3
> -
> -    movu        m1, [r1 + 4 * mmsize]
> -    movu        m2, [r1 + 5 * mmsize]
> -    movu        m3, [r1 + 6 * mmsize]
> -    movu        m4, [r1 + 7 * mmsize]
> -    packssdw    m1, m2
> -    packssdw    m3, m4
> -    psllw       m1, m0
> -    psllw       m3, m0
> -    movu        [r0 + 2 * mmsize], m1
> -    movu        [r0 + 3 * mmsize], m3
> -
> -    add         r1, 8 * mmsize
> -    add         r0, r2
> -    dec         r3d
> -    jnz        .loop
> -    RET
> -
> -
> -INIT_YMM avx2
> -cglobal cvt32to16_shl_32, 3,4,5
> -    add         r2d, r2d
> -    movd        xm0, r3m
> -    mov         r3d, 32/2
> -
> -.loop:
> -    ; Row 0
> -    movu        xm1, [r1 + 0 * mmsize]
> -    vinserti128  m1, m1, [r1 + 1 * mmsize], 1
> -    movu        xm2, [r1 + 0 * mmsize + mmsize/2]
> -    vinserti128  m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
> -    movu        xm3, [r1 + 2 * mmsize]
> -    vinserti128  m3, m3, [r1 + 3 * mmsize], 1
> -    movu        xm4, [r1 + 2 * mmsize + mmsize/2]
> -    vinserti128  m4, m4, [r1 + 3 * mmsize + mmsize/2], 1
> -    packssdw    m1, m2
> -    packssdw    m3, m4
> -    psllw       m1, xm0
> -    psllw       m3, xm0
> -    movu        [r0], m1
> -    movu        [r0 + mmsize], m3
> -
> -    add         r1, 4 * mmsize
> -
> -    ; Row 1
> -    movu        xm1, [r1 + 0 * mmsize]
> -    vinserti128  m1, m1, [r1 + 1 * mmsize], 1
> -    movu        xm2, [r1 + 0 * mmsize + mmsize/2]
> -    vinserti128  m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
> -    movu        m3, [r1 + 2 * mmsize]
> -    movu        m4, [r1 + 3 * mmsize]
> -    packssdw    m1, m2
> -    packssdw    m3, m4
> -    psllw       m1, xm0
> -    psllw       m3, xm0
> -    vpermq      m3, m3, 11011000b
> -    movu        [r0 + r2], m1
> -    movu        [r0 + r2 + mmsize], m3
> +    movu        [r0 + r2 + mmsize], m4
>  
>      add         r1, 4 * mmsize
>      lea         r0, [r0 + r2 * 2]
> @@ -4262,7 +4104,7 @@
>  
>  
>  ;--------------------------------------------------------------------------------------
> -; uint32_t copy_cnt(int16_t *dst, int16_t *src, intptr_t stride);
> +; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
>  ;--------------------------------------------------------------------------------------
>  INIT_XMM sse4
>  cglobal copy_cnt_4, 3,3,3
> @@ -4301,7 +4143,7 @@
>  
>  
>  ;--------------------------------------------------------------------------------------
> -; uint32_t copy_cnt(int16_t *dst, int16_t *src, intptr_t stride);
> +; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
>  ;--------------------------------------------------------------------------------------
>  INIT_XMM sse4
>  cglobal copy_cnt_8, 3,3,6
> @@ -4405,7 +4247,7 @@
>  
>  
>  ;--------------------------------------------------------------------------------------
> -; uint32_t copy_cnt(int16_t *dst, int16_t *src, intptr_t stride);
> +; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
>  ;--------------------------------------------------------------------------------------
>  INIT_XMM sse4
>  cglobal copy_cnt_16, 3,4,6
> @@ -4516,7 +4358,7 @@
>      RET
>  
>  ;--------------------------------------------------------------------------------------
> -; uint32_t copy_cnt(int32_t *dst, int16_t *src, intptr_t stride);
> +; uint32_t copy_cnt(int32_t* dst, const int16_t* src, intptr_t stride);
>  ;--------------------------------------------------------------------------------------
>  INIT_XMM sse4
>  cglobal copy_cnt_32, 3,4,6
> @@ -4623,180 +4465,432 @@
>      movd         eax, xm4
>      RET
>  
> -;-----------------------------------------------------------------------------
> -; void copy_shr(short *dst, short *src, intptr_t stride, int shift, int size)
> -;-----------------------------------------------------------------------------
> -
> -INIT_XMM sse4
> -cglobal copy_shr, 4, 7, 4, dst, src, stride
> -%define rnd     m2
> -%define shift   m1
> -
> -    ; make shift
> -    mov         r5d, r3m
> -    movd        shift, r5d
> -
> -    ; make round
> -    dec         r5
> -    xor         r6, r6
> -    bts         r6, r5
> -
> -    movd        rnd, r6d
> -    pshufd      rnd, rnd, 0
> +
> +;--------------------------------------------------------------------------------------
> +; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> +;--------------------------------------------------------------------------------------
> +INIT_XMM sse2
> +cglobal cpy2Dto1D_shl_4, 4, 4, 4
> +    add             r2d, r2d
> +    movd            m0, r3d
>  
>      ; register alloc
>      ; r0 - dst
>      ; r1 - src
> -    ; r2 - stride * 2 (short*)
> -    ; r3 - lx
> -    ; r4 - size
> -    ; r5 - ly
> -    ; r6 - diff
> -    add         r2d, r2d
> -
> -    mov         r4d, r4m
> -    mov         r5, r4 ; size
> -    mov         r6, r2 ; stride
> -    sub         r6, r4
> -    add         r6, r6
> -
> -    shr         r5, 1
> -.loop_row:
> -
> -    mov         r3, r4
> -    shr         r3, 2
> -.loop_col:
> -    ; row 0
> -    movh        m3, [r1]
> -    pmovsxwd    m0, m3
> -    paddd       m0, rnd
> -    psrad       m0, shift
> -    packssdw    m0, m0
> -    movh        [r0], m0
> -
> -    ; row 1
> -    movh        m3, [r1 + r4 * 2]
> -    pmovsxwd    m0, m3
> -    paddd       m0, rnd
> -    psrad       m0, shift
> -    packssdw    m0, m0
> -    movh        [r0 + r2], m0
> -
> -    ; move col pointer
> -    add         r1, 8
> -    add         r0, 8
> -
> -    dec         r3
> -    jg          .loop_col
> -
> -    ; update pointer
> -    lea         r1, [r1 + r4 * 2]
> -    add         r0, r6
> -
> -    ; end of loop_row
> -    dec         r5
> -    jg         .loop_row
> +    ; r2 - srcStride
> +    ; m0 - shift
> +
> +    ; Row 0-3
> +    movh            m2, [r1]
> +    movhps          m2, [r1 + r2]
> +    lea             r1, [r1 + r2 * 2]
> +    movh            m3, [r1]
> +    movhps          m3, [r1 + r2]
> +    psllw           m2, m0
> +    psllw           m3, m0
> +    mova            [r0 + 0 * mmsize], m2
> +    mova            [r0 + 1 * mmsize], m3
>  
>      RET
>  
> +
>  ;--------------------------------------------------------------------------------------
> -; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
> +; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
>  ;--------------------------------------------------------------------------------------
>  INIT_XMM sse2
> -cglobal copy_shl_4, 3,3,3
> +cglobal cpy2Dto1D_shl_8, 4, 5, 4
> +    add             r2d, r2d
> +    movd            m0, r3d
> +    mov             r3d, 8/4
> +    lea             r4, [r2 * 3]
> +
> +    ; register alloc
> +    ; r0 - dst
> +    ; r1 - src
> +    ; r2 - srcStride
> +    ; r3 - loop counter
> +    ; r4 - stride * 3
> +    ; m0 - shift
> +
> +.loop:
> +    ; Row 0, 1
> +    mova            m2, [r1]
> +    mova            m3, [r1 + r2]
> +    psllw           m2, m0
> +    psllw           m3, m0
> +    mova            [r0 + 0 * mmsize], m2
> +    mova            [r0 + 1 * mmsize], m3
> +
> +    ; Row 2, 3
> +    mova            m2, [r1 + r2 * 2]
> +    mova            m3, [r1 + r4]
> +    psllw           m2, m0
> +    psllw           m3, m0
> +    mova            [r0 + 2 * mmsize], m2
> +    mova            [r0 + 3 * mmsize], m3
> +
> +    add             r0, 4 * mmsize
> +    lea             r1, [r1 + r2 * 4]
> +    dec             r3d
> +    jnz            .loop
> +    RET
> +
> +
> +;--------------------------------------------------------------------------------------
> +; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> +;--------------------------------------------------------------------------------------
> +INIT_XMM sse2
> +cglobal cpy2Dto1D_shl_16, 4, 4, 4
> +    add             r2d, r2d
> +    movd            m0, r3d
> +    mov             r3d, 16/2
> +
> +    ; register alloc
> +    ; r0 - dst
> +    ; r1 - src
> +    ; r2 - srcStride
> +    ; r3 - loop counter
> +    ; m0 - shift
> +
> +.loop:
> +    ; Row 0
> +    mova            m2, [r1 + 0 * mmsize]
> +    mova            m3, [r1 + 1 * mmsize]
> +    psllw           m2, m0
> +    psllw           m3, m0
> +    mova            [r0 + 0 * mmsize], m2
> +    mova            [r0 + 1 * mmsize], m3
> +
> +    ; Row 1
> +    mova            m2, [r1 + r2 + 0 * mmsize]
> +    mova            m3, [r1 + r2 + 1 * mmsize]
> +    psllw           m2, m0
> +    psllw           m3, m0
> +    mova            [r0 + 2 * mmsize], m2
> +    mova            [r0 + 3 * mmsize], m3
> +
> +    add             r0, 4 * mmsize
> +    lea             r1, [r1 + r2 * 2]
> +    dec             r3d
> +    jnz            .loop
> +    RET
> +
> +
> +;--------------------------------------------------------------------------------------
> +; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> +;--------------------------------------------------------------------------------------
> +INIT_XMM sse2
> +cglobal cpy2Dto1D_shl_32, 4, 4, 6
> +    add             r2d, r2d
> +    movd            m0, r3d
> +    mov             r3d, 32/1
> +
> +    ; register alloc
> +    ; r0 - dst
> +    ; r1 - src
> +    ; r2 - srcStride
> +    ; r3 - loop counter
> +    ; m0 - shift
> +
> +.loop:
> +    ; Row 0
> +    mova            m2, [r1 + 0 * mmsize]
> +    mova            m3, [r1 + 1 * mmsize]
> +    mova            m4, [r1 + 2 * mmsize]
> +    mova            m5, [r1 + 3 * mmsize]
> +    psllw           m2, m0
> +    psllw           m3, m0
> +    psllw           m4, m0
> +    psllw           m5, m0
> +    mova            [r0 + 0 * mmsize], m2
> +    mova            [r0 + 1 * mmsize], m3
> +    mova            [r0 + 2 * mmsize], m4
> +    mova            [r0 + 3 * mmsize], m5
> +
> +    add             r0, 4 * mmsize
> +    add             r1, r2
> +    dec             r3d
> +    jnz            .loop
> +    RET
> +
> +
> +;--------------------------------------------------------------------------------------
> +; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
> +;--------------------------------------------------------------------------------------
> +INIT_XMM sse2
> +cglobal cpy1Dto2D_shr_4, 3, 3, 4
>      add         r2d, r2d
>      movd        m0, r3m
> +    pcmpeqw	m1, m1
> +    psllw       m1, m0
> +    psraw       m1, 1
>  
>      ; Row 0-3
> -    movu        m1, [r1 + 0 * mmsize]
> -    movu        m2, [r1 + 1 * mmsize]
> -    psllw       m1, m0
> -    psllw       m2, m0
> -    movh        [r0], m1
> -    movhps      [r0 + r2], m1
> -    movh        [r0 + r2 * 2], m2
> +    mova        m2, [r1 + 0 * mmsize]
> +    mova        m3, [r1 + 1 * mmsize]
> +    psubw       m2, m1
> +    psubw       m3, m1
> +    psraw       m2, m0
> +    psraw       m3, m0
> +    movh        [r0], m2
> +    movhps      [r0 + r2], m2
> +    movh        [r0 + r2 * 2], m3
>      lea         r2, [r2 * 3]
> -    movhps      [r0 + r2], m2
> +    movhps      [r0 + r2], m3
>      RET
>  
> +
> +INIT_YMM avx2
> +cglobal cpy1Dto2D_shr_4, 3, 3, 3
> +    add         r2d, r2d
> +    movd        xm0, r3m
> +    pcmpeqw	m1, m1
> +    psllw       m1, xm0
> +    psraw       m1, 1
> +
> +    ; Row 0-3
> +    movu        m2, [r1]
> +    psubw       m2, m1
> +    psraw       m2, xm0
> +    vextracti128 xm1, m2, 1
> +    movq        [r0], xm2
> +    movhps      [r0 + r2], xm2
> +    lea         r0, [r0 + r2 * 2]
> +    movq        [r0], xm1
> +    movhps      [r0 + r2], xm1
> +    RET
> +
> +
>  ;--------------------------------------------------------------------------------------
> -; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
> +; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
>  ;--------------------------------------------------------------------------------------
>  INIT_XMM sse2
> -cglobal copy_shl_8, 3,4,5
> +cglobal cpy1Dto2D_shr_8, 3, 4, 6
>      add         r2d, r2d
>      movd        m0, r3m
> +    pcmpeqw	m1, m1
> +    psllw       m1, m0
> +    psraw       m1, 1
> +    lea         r3, [r2 * 3]
>  
>      ; Row 0-3
> -    movu        m1, [r1 + 0 * mmsize]
> -    movu        m2, [r1 + 1 * mmsize]
> -    movu        m3, [r1 + 2 * mmsize]
> -    movu        m4, [r1 + 3 * mmsize]
> -    psllw       m1, m0
> -    psllw       m2, m0
> -    psllw       m3, m0
> -    psllw       m4, m0
> -    movu        [r0], m1
> -    movu        [r0 + r2], m2
> -    movu        [r0 + 2 * r2], m3
> -    lea         r0, [r0 + 2 * r2]
> -    movu        [r0 + r2], m4
> +    mova        m2, [r1 + 0 * mmsize]
> +    mova        m3, [r1 + 1 * mmsize]
> +    mova        m4, [r1 + 2 * mmsize]
> +    mova        m5, [r1 + 3 * mmsize]
> +    psubw       m2, m1
> +    psubw       m3, m1
> +    psubw       m4, m1
> +    psubw       m5, m1
> +    psraw       m2, m0
> +    psraw       m3, m0
> +    psraw       m4, m0
> +    psraw       m5, m0
> +    mova        [r0], m2
> +    mova        [r0 + r2], m3
> +    mova        [r0 + r2 * 2], m4
> +    mova        [r0 + r3], m5
>  
>      ; Row 4-7
> -    movu        m1, [r1 + 4 * mmsize]
> -    movu        m2, [r1 + 5 * mmsize]
> -    movu        m3, [r1 + 6 * mmsize]
> -    movu        m4, [r1 + 7 * mmsize]
> -    psllw       m1, m0
> -    psllw       m2, m0
> -    psllw       m3, m0
> -    psllw       m4, m0
> -    movu        [r0 + r2 * 2], m1
> -    lea         r0, [r0 + 2 * r2]
> -    movu        [r0 + r2], m2
> -    movu        [r0 + 2 * r2], m3
> -    lea         r0, [r0 + 2 * r2]
> -    movu        [r0 + r2], m4
> +    mova        m2, [r1 + 4 * mmsize]
> +    mova        m3, [r1 + 5 * mmsize]
> +    mova        m4, [r1 + 6 * mmsize]
> +    mova        m5, [r1 + 7 * mmsize]
> +    lea         r0, [r0 + r2 * 4]
> +    psubw       m2, m1
> +    psubw       m3, m1
> +    psubw       m4, m1
> +    psubw       m5, m1
> +    psraw       m2, m0
> +    psraw       m3, m0
> +    psraw       m4, m0
> +    psraw       m5, m0
> +    mova        [r0], m2
> +    mova        [r0 + r2], m3
> +    mova        [r0 + r2 * 2], m4
> +    mova        [r0 + r3], m5
>      RET
>  
> +
> +INIT_YMM avx2
> +cglobal cpy1Dto2D_shr_8, 3, 4, 4
> +    add         r2d, r2d
> +    movd        xm0, r3m
> +    pcmpeqw	m1, m1
> +    psllw       m1, xm0
> +    psraw       m1, 1
> +    lea         r3, [r2 * 3]
> +
> +    ; Row 0-3
> +    movu        m2, [r1 + 0 * mmsize]
> +    movu        m3, [r1 + 1 * mmsize]
> +    psubw       m2, m1
> +    psubw       m3, m1
> +    psraw       m2, xm0
> +    psraw       m3, xm0
> +    movu        [r0], xm2
> +    vextracti128 [r0 + r2], m2, 1
> +    movu        [r0 + r2 * 2], xm3
> +    vextracti128 [r0 + r3], m3, 1
> +
> +    ; Row 4-7
> +    movu        m2, [r1 + 2 * mmsize]
> +    movu        m3, [r1 + 3 * mmsize]
> +    lea         r0, [r0 + r2 * 4]
> +    psubw       m2, m1
> +    psubw       m3, m1
> +    psraw       m2, xm0
> +    psraw       m3, xm0
> +    movu        [r0], xm2
> +    vextracti128 [r0 + r2], m2, 1
> +    movu        [r0 + r2 * 2], xm3
> +    vextracti128 [r0 + r3], m3, 1
> +    RET
> +
> +
>  ;--------------------------------------------------------------------------------------
> -; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
> +; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
>  ;--------------------------------------------------------------------------------------
>  INIT_XMM sse2
> -cglobal copy_shl_16, 3,4,5
> +cglobal cpy1Dto2D_shr_16, 3, 5, 6
>      add         r2d, r2d
>      movd        m0, r3m
> -    mov         r3d, 256/64
> +    pcmpeqw	m1, m1
> +    psllw       m1, m0
> +    psraw       m1, 1
> +    mov         r3d, 16/4
> +    lea         r4, [r2 * 3]
>  
>  .loop:
> -    ; Row 0-3
> -    movu        m1, [r1 + 0 * mmsize]
> -    movu        m2, [r1 + 1 * mmsize]
> -    movu        m3, [r1 + 2 * mmsize]
> -    movu        m4, [r1 + 3 * mmsize]
> +    ; Row 0-1
> +    mova        m2, [r1 + 0 * mmsize]
> +    mova        m3, [r1 + 1 * mmsize]
> +    mova        m4, [r1 + 2 * mmsize]
> +    mova        m5, [r1 + 3 * mmsize]
> +    psubw       m2, m1
> +    psubw       m3, m1
> +    psubw       m4, m1
> +    psubw       m5, m1
> +    psraw       m2, m0
> +    psraw       m3, m0
> +    psraw       m4, m0
> +    psraw       m5, m0
> +    mova        [r0], m2
> +    mova        [r0 + mmsize], m3
> +    mova        [r0 + r2], m4
> +    mova        [r0 + r2 + mmsize], m5
> +
> +    ; Row 2-3
> +    mova        m2, [r1 + 4 * mmsize]
> +    mova        m3, [r1 + 5 * mmsize]
> +    mova        m4, [r1 + 6 * mmsize]
> +    mova        m5, [r1 + 7 * mmsize]
> +    psubw       m2, m1
> +    psubw       m3, m1
> +    psubw       m4, m1
> +    psubw       m5, m1
> +    psraw       m2, m0
> +    psraw       m3, m0
> +    psraw       m4, m0
> +    psraw       m5, m0
> +    mova        [r0 + r2 * 2], m2
> +    mova        [r0 + r2 * 2 + mmsize], m3
> +    mova        [r0 + r4], m4
> +    mova        [r0 + r4 + mmsize], m5
> +
> +    add         r1, 8 * mmsize
> +    lea         r0, [r0 + r2 * 4]
> +    dec         r3d
> +    jnz        .loop
> +    RET
> +
> +
> +INIT_YMM avx2
> +cglobal cpy1Dto2D_shr_16, 3, 5, 4
> +    add         r2d, r2d
> +    movd        xm0, r3m
> +    pcmpeqw	m1, m1
> +    psllw       m1, xm0
> +    psraw       m1, 1
> +    mov         r3d, 16/4
> +    lea         r4, [r2 * 3]
> +
> +.loop:
> +    ; Row 0-1
> +    movu        m2, [r1 + 0 * mmsize]
> +    movu        m3, [r1 + 1 * mmsize]
> +    psubw       m2, m1
> +    psubw       m3, m1
> +    psraw       m2, xm0
> +    psraw       m3, xm0
> +    movu        [r0], m2
> +    movu        [r0 + r2], m3
> +
> +    ; Row 2-3
> +    movu        m2, [r1 + 2 * mmsize]
> +    movu        m3, [r1 + 3 * mmsize]
> +    psubw       m2, m1
> +    psubw       m3, m1
> +    psraw       m2, xm0
> +    psraw       m3, xm0
> +    movu        [r0 + r2 * 2], m2
> +    movu        [r0 + r4], m3
> +
> +    add         r1, 4 * mmsize
> +    lea         r0, [r0 + r2 * 4]
> +    dec         r3d
> +    jnz        .loop
> +    RET
> +
> +
> +;--------------------------------------------------------------------------------------
> +; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
> +;--------------------------------------------------------------------------------------
> +INIT_XMM sse2
> +cglobal cpy1Dto2D_shr_32, 3, 4, 6
> +    add         r2d, r2d
> +    movd        m0, r3m
> +    pcmpeqw	m1, m1
>      psllw       m1, m0
> -    psllw       m2, m0
> -    psllw       m3, m0
> -    psllw       m4, m0
> -    movu        [r0], m1
> -    movu        [r0 + 16], m2
> -    movu        [r0 + r2], m3
> -    movu        [r0 + r2 + 16], m4
> -
> -    ; Row 4-7
> -    movu        m1, [r1 + 4 * mmsize]
> -    movu        m2, [r1 + 5 * mmsize]
> -    movu        m3, [r1 + 6 * mmsize]
> -    movu        m4, [r1 + 7 * mmsize]
> -    psllw       m1, m0
> -    psllw       m2, m0
> -    psllw       m3, m0
> -    psllw       m4, m0
> -    movu        [r0 + r2 * 2], m1
> -    movu        [r0 + r2 * 2 + 16], m2
> -    lea         r0, [r0 + r2 * 2]
> -    movu        [r0 + r2], m3
> -    movu        [r0 + r2 + 16], m4
> +    psraw       m1, 1
> +    mov         r3d, 32/2
> +
> +.loop:
> +    ; Row 0
> +    mova        m2, [r1 + 0 * mmsize]
> +    mova        m3, [r1 + 1 * mmsize]
> +    mova        m4, [r1 + 2 * mmsize]
> +    mova        m5, [r1 + 3 * mmsize]
> +    psubw       m2, m1
> +    psubw       m3, m1
> +    psubw       m4, m1
> +    psubw       m5, m1
> +    psraw       m2, m0
> +    psraw       m3, m0
> +    psraw       m4, m0
> +    psraw       m5, m0
> +    mova        [r0 + 0 * mmsize], m2
> +    mova        [r0 + 1 * mmsize], m3
> +    mova        [r0 + 2 * mmsize], m4
> +    mova        [r0 + 3 * mmsize], m5
> +
> +    ; Row 1
> +    mova        m2, [r1 + 4 * mmsize]
> +    mova        m3, [r1 + 5 * mmsize]
> +    mova        m4, [r1 + 6 * mmsize]
> +    mova        m5, [r1 + 7 * mmsize]
> +    psubw       m2, m1
> +    psubw       m3, m1
> +    psubw       m4, m1
> +    psubw       m5, m1
> +    psraw       m2, m0
> +    psraw       m3, m0
> +    psraw       m4, m0
> +    psraw       m5, m0
> +    mova        [r0 + r2 + 0 * mmsize], m2
> +    mova        [r0 + r2 + 1 * mmsize], m3
> +    mova        [r0 + r2 + 2 * mmsize], m4
> +    mova        [r0 + r2 + 3 * mmsize], m5
>  
>      add         r1, 8 * mmsize
>      lea         r0, [r0 + r2 * 2]
> @@ -4804,45 +4898,36 @@
>      jnz        .loop
>      RET
>  
> -;--------------------------------------------------------------------------------------
> -; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
> -;--------------------------------------------------------------------------------------
> -INIT_XMM sse2
> -cglobal copy_shl_32, 3,4,5
> +
> +INIT_YMM avx2
> +cglobal cpy1Dto2D_shr_32, 3, 4, 6
>      add         r2d, r2d
> -    movd        m0, r3m
> -    mov         r3d, 1024/64
> +    movd        xm0, r3m
> +    pcmpeqw	m1, m1
> +    psllw       m1, xm0
> +    psraw       m1, 1
> +    mov         r3d, 32/2
>  
>  .loop:
> -    ; Row 0-3
> -    movu        m1, [r1 + 0 * mmsize]
> -    movu        m2, [r1 + 1 * mmsize]
> -    movu        m3, [r1 + 2 * mmsize]
> -    movu        m4, [r1 + 3 * mmsize]
> -    psllw       m1, m0
> -    psllw       m2, m0
> -    psllw       m3, m0
> -    psllw       m4, m0
> -    movu        [r0], m1
> -    movu        [r0 + 16], m2
> -    movu        [r0 + 32], m3
> -    movu        [r0 + 48], m4
> -
> -    ; Row 4-7
> -    movu        m1, [r1 + 4 * mmsize]
> -    movu        m2, [r1 + 5 * mmsize]
> -    movu        m3, [r1 + 6 * mmsize]
> -    movu        m4, [r1 + 7 * mmsize]
> -    psllw       m1, m0
> -    psllw       m2, m0
> -    psllw       m3, m0
> -    psllw       m4, m0
> -    movu        [r0 + r2], m1
> -    movu        [r0 + r2 + 16], m2
> -    movu        [r0 + r2 + 32], m3
> -    movu        [r0 + r2 + 48], m4
> -
> -    add         r1, 8 * mmsize
> +    ; Row 0-1
> +    movu        m2, [r1 + 0 * mmsize]
> +    movu        m3, [r1 + 1 * mmsize]
> +    movu        m4, [r1 + 2 * mmsize]
> +    movu        m5, [r1 + 3 * mmsize]
> +    psubw       m2, m1
> +    psubw       m3, m1
> +    psubw       m4, m1
> +    psubw       m5, m1
> +    psraw       m2, xm0
> +    psraw       m3, xm0
> +    psraw       m4, xm0
> +    psraw       m5, xm0
> +    movu        [r0], m2
> +    movu        [r0 + mmsize], m3
> +    movu        [r0 + r2], m4
> +    movu        [r0 + r2 + mmsize], m5
> +
> +    add         r1, 4 * mmsize
>      lea         r0, [r0 + r2 * 2]
>      dec         r3d
>      jnz        .loop
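
For reference, if I'm reading the SSE2 versions right, the scalar behaviour of
the two new copy kernels is roughly the following plain-C sketch (function and
variable names are only for illustration, and it assumes shift >= 1 for the
_shr rounding term):

    /* pack a 2D block into a contiguous buffer, shifting each sample left */
    static void cpy2Dto1D_shl_c(int16_t* dst, const int16_t* src,
                                intptr_t srcStride, int shift, int size)
    {
        for (int i = 0; i < size; i++)
        {
            for (int j = 0; j < size; j++)
                dst[j] = (int16_t)(src[j] << shift);
            src += srcStride;
            dst += size;
        }
    }

    /* unpack a contiguous buffer into a 2D block, rounding then shifting
     * right; the pcmpeqw/psllw/psraw prologue above builds -(1 << (shift - 1)),
     * which the psubw then adds back as the rounding offset */
    static void cpy1Dto2D_shr_c(int16_t* dst, const int16_t* src,
                                intptr_t dstStride, int shift, int size)
    {
        int16_t round = (int16_t)(1 << (shift - 1));
        for (int i = 0; i < size; i++)
        {
            for (int j = 0; j < size; j++)
                dst[j] = (int16_t)((src[j] + round) >> shift);
            src += size;
            dst += dstStride;
        }
    }
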
> diff -r dfe0803ae6be -r b4454aa1b6ab source/common/x86/blockcopy8.h
> --- a/source/common/x86/blockcopy8.h	Wed Nov 26 16:56:00 2014 -0600
> +++ b/source/common/x86/blockcopy8.h	Thu Nov 27 10:12:03 2014 +0900
> @@ -24,32 +24,38 @@
>  #ifndef X265_BLOCKCOPY8_H
>  #define X265_BLOCKCOPY8_H
>  
> -void x265_cvt32to16_shl_4_sse2(int16_t* dst, const int* src, intptr_t, int);
> -void x265_cvt32to16_shl_8_sse2(int16_t* dst, const int* src, intptr_t, int);
> -void x265_cvt32to16_shl_16_sse2(int16_t* dst, const int* src, intptr_t, int);
> -void x265_cvt32to16_shl_32_sse2(int16_t* dst, const int* src, intptr_t, int);
> -void x265_cvt32to16_shl_4_avx2(int16_t* dst, const int* src, intptr_t, int);
> -void x265_cvt32to16_shl_8_avx2(int16_t* dst, const int* src, intptr_t, int);
> -void x265_cvt32to16_shl_16_avx2(int16_t* dst, const int* src, intptr_t, int);
> -void x265_cvt32to16_shl_32_avx2(int16_t* dst, const int* src, intptr_t, int);
> -void x265_copy16to16_shl_sse2(int16_t* dst, const int16_t* src, intptr_t, int32_t, int32_t);
> -void x265_cvt16to32_shr_4_sse4(int32_t* dst, const int16_t* src, intptr_t, int32_t, int32_t);
> -void x265_cvt16to32_shr_8_sse4(int32_t* dst, const int16_t* src, intptr_t, int32_t, int32_t);
> -void x265_cvt16to32_shr_16_sse4(int32_t* dst, const int16_t* src, intptr_t, int32_t, int32_t);
> -void x265_cvt16to32_shr_32_sse4(int32_t* dst, const int16_t* src, intptr_t, int32_t, int32_t);
> -void x265_copy_shr_sse4(int16_t* dst, const int16_t* src, intptr_t, int, int);
> -void x265_copy_shl_4_sse2(int16_t* dst, const int16_t* src, intptr_t, int);
> -void x265_copy_shl_8_sse2(int16_t* dst, const int16_t* src, intptr_t, int);
> -void x265_copy_shl_16_sse2(int16_t* dst, const int16_t* src, intptr_t, int);
> -void x265_copy_shl_32_sse2(int16_t* dst, const int16_t* src, intptr_t, int);
> -uint32_t x265_copy_cnt_4_sse4(int16_t* dst, const int16_t* src, intptr_t);
> -uint32_t x265_copy_cnt_8_sse4(int16_t* dst, const int16_t* src, intptr_t);
> -uint32_t x265_copy_cnt_16_sse4(int16_t* dst, const int16_t* src, intptr_t);
> -uint32_t x265_copy_cnt_32_sse4(int16_t* dst, const int16_t* src, intptr_t);
> -uint32_t x265_copy_cnt_4_avx2(int16_t* dst, const int16_t* src, intptr_t);
> -uint32_t x265_copy_cnt_8_avx2(int16_t* dst, const int16_t* src, intptr_t);
> -uint32_t x265_copy_cnt_16_avx2(int16_t* dst, const int16_t* src, intptr_t);
> -uint32_t x265_copy_cnt_32_avx2(int16_t* dst, const int16_t* src, intptr_t);
> +void x265_cpy2Dto1D_shl_4_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> +void x265_cpy2Dto1D_shl_8_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> +void x265_cpy2Dto1D_shl_16_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> +void x265_cpy2Dto1D_shl_32_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> +void x265_cpy2Dto1D_shr_4_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> +void x265_cpy2Dto1D_shr_8_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> +void x265_cpy2Dto1D_shr_16_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> +void x265_cpy2Dto1D_shr_32_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> +void x265_cpy1Dto2D_shl_4_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> +void x265_cpy1Dto2D_shl_8_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> +void x265_cpy1Dto2D_shl_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> +void x265_cpy1Dto2D_shl_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> +void x265_cpy1Dto2D_shl_4_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
> +void x265_cpy1Dto2D_shl_8_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
> +void x265_cpy1Dto2D_shl_16_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
> +void x265_cpy1Dto2D_shl_32_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
> +void x265_cpy1Dto2D_shr_4_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> +void x265_cpy1Dto2D_shr_8_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
> +void x265_cpy1Dto2D_shr_16_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
> +void x265_cpy1Dto2D_shr_32_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
> +void x265_cpy1Dto2D_shr_4_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
> +void x265_cpy1Dto2D_shr_8_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
> +void x265_cpy1Dto2D_shr_16_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
> +void x265_cpy1Dto2D_shr_32_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
> +uint32_t x265_copy_cnt_4_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
> +uint32_t x265_copy_cnt_8_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
> +uint32_t x265_copy_cnt_16_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
> +uint32_t x265_copy_cnt_32_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
> +uint32_t x265_copy_cnt_4_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
> +uint32_t x265_copy_cnt_8_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
> +uint32_t x265_copy_cnt_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
> +uint32_t x265_copy_cnt_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
>  
>  #define SETUP_BLOCKCOPY_FUNC(W, H, cpu) \
>      void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); \
> @@ -181,17 +187,17 @@
>  void x265_blockfill_s_8x8_sse2(int16_t* dst, intptr_t dstride, int16_t val);
>  void x265_blockfill_s_16x16_sse2(int16_t* dst, intptr_t dstride, int16_t val);
>  void x265_blockfill_s_32x32_sse2(int16_t* dst, intptr_t dstride, int16_t val);
> -void x265_blockcopy_ss_16x4_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
> -void x265_blockcopy_ss_16x8_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
> -void x265_blockcopy_ss_16x12_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
> -void x265_blockcopy_ss_16x16_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
> -void x265_blockcopy_ss_16x24_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
> -void x265_blockcopy_ss_16x32_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
> -void x265_blockcopy_ss_16x64_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
> -void x265_blockcopy_ss_64x16_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
> -void x265_blockcopy_ss_64x32_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
> -void x265_blockcopy_ss_64x48_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
> -void x265_blockcopy_ss_64x64_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
> +void x265_blockcopy_ss_16x4_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
> +void x265_blockcopy_ss_16x8_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
> +void x265_blockcopy_ss_16x12_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
> +void x265_blockcopy_ss_16x16_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
> +void x265_blockcopy_ss_16x24_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
> +void x265_blockcopy_ss_16x32_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
> +void x265_blockcopy_ss_16x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
> +void x265_blockcopy_ss_64x16_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
> +void x265_blockcopy_ss_64x32_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
> +void x265_blockcopy_ss_64x48_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
> +void x265_blockcopy_ss_64x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
>  
>  void x265_blockcopy_pp_32x8_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
>  void x265_blockcopy_pp_32x16_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
> diff -r dfe0803ae6be -r b4454aa1b6ab source/common/x86/dct8.asm
> --- a/source/common/x86/dct8.asm	Wed Nov 26 16:56:00 2014 -0600
> +++ b/source/common/x86/dct8.asm	Thu Nov 27 10:12:03 2014 +0900
> @@ -318,7 +318,7 @@
>  cextern pw_ppppmmmm
>  
>  ;------------------------------------------------------
> -;void dct4(int16_t *src, int16_t *dst, intptr_t stride)
> +;void dct4(const int16_t* src, int16_t* dst, intptr_t srcStride)
>  ;------------------------------------------------------
>  INIT_XMM sse2
>  cglobal dct4, 3, 4, 8
> @@ -475,7 +475,7 @@
>      RET
>  
>  ;-------------------------------------------------------
> -;void idct4(int16_t *src, int16_t *dst, intptr_t stride)
> +;void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride)
>  ;-------------------------------------------------------
>  INIT_XMM sse2
>  cglobal idct4, 3, 4, 7
> @@ -565,7 +565,7 @@
>      RET
>  
>  ;------------------------------------------------------
> -;void dst4(int16_t *src, int16_t *dst, intptr_t stride)
> +;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride)
>  ;------------------------------------------------------
>  INIT_XMM ssse3
>  %if ARCH_X86_64
> @@ -657,7 +657,7 @@
>      RET
>  
>  ;-------------------------------------------------------
> -;void idst4(int16_t *src, int16_t *dst, intptr_t stride)
> +;void idst4(const int16_t* src, int16_t* dst, intptr_t dstStride)
>  ;-------------------------------------------------------
>  INIT_XMM sse2
>  cglobal idst4, 3, 4, 7
> @@ -750,7 +750,7 @@
>  
>  
>  ;-------------------------------------------------------
> -; void dct8(int16_t *src, int16_t *dst, intptr_t stride)
> +; void dct8(const int16_t* src, int16_t* dst, intptr_t srcStride)
>  ;-------------------------------------------------------
>  INIT_XMM sse4
>  cglobal dct8, 3,6,7,0-16*mmsize
> @@ -974,7 +974,7 @@
>      RET
>  
>  ;-------------------------------------------------------
> -; void idct8(int16_t *src, int16_t *dst, intptr_t stride)
> +; void idct8(const int16_t* src, int16_t* dst, intptr_t dstStride)
>  ;-------------------------------------------------------
>  INIT_XMM ssse3
>  
> @@ -1164,7 +1164,7 @@
>  
>  
>  ;-----------------------------------------------------------------------------
> -; void denoise_dct(int16_t *dct, uint32_t *sum, uint16_t *offset, int size)
> +; void denoise_dct(int16_t* dct, uint32_t* sum, uint16_t* offset, int size)
>  ;-----------------------------------------------------------------------------
>  INIT_XMM sse4
>  cglobal denoise_dct, 4, 4, 6
> @@ -2106,7 +2106,7 @@
>  %endmacro
>  
>  ;-------------------------------------------------------
> -; void idct16(int16_t *src, int16_t *dst, intptr_t stride)
> +; void idct16(const int16_t* src, int16_t* dst, intptr_t dstStride)
>  ;-------------------------------------------------------
>  INIT_YMM avx2
>  cglobal idct16, 3, 7, 16, 0-16*mmsize
> @@ -2385,7 +2385,7 @@
>  %endmacro
>  
>  ;-------------------------------------------------------
> -; void idct32(int16_t *src, int16_t *dst, intptr_t stride)
> +; void idct32(const int16_t* src, int16_t* dst, intptr_t dstStride)
>  ;-------------------------------------------------------
>  
>  ; TODO: Reduce PHADDD instruction by PADDD
> @@ -2684,7 +2684,7 @@
>      RET
>  
>  ;-------------------------------------------------------
> -; void idct4(int16_t *src, int16_t *dst, intptr_t stride)
> +; void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride)
>  ;-------------------------------------------------------
>  INIT_YMM avx2
>  cglobal idct4, 3, 4, 6
> diff -r dfe0803ae6be -r b4454aa1b6ab source/common/x86/dct8.h
> --- a/source/common/x86/dct8.h	Wed Nov 26 16:56:00 2014 -0600
> +++ b/source/common/x86/dct8.h	Thu Nov 27 10:12:03 2014 +0900
> @@ -23,21 +23,21 @@
>  
>  #ifndef X265_DCT8_H
>  #define X265_DCT8_H
> -void x265_dct4_sse2(const int16_t* src, int16_t* dst, intptr_t stride);
> -void x265_dst4_ssse3(const int16_t* src, int16_t* dst, intptr_t stride);
> -void x265_dct8_sse4(const int16_t* src, int16_t* dst, intptr_t stride);
> -void x265_dct4_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
> -void x265_dct8_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
> -void x265_dct16_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
> -void x265_dct32_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
> -void x265_idct32_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
> +void x265_dct4_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride);
> +void x265_dst4_ssse3(const int16_t* src, int16_t* dst, intptr_t srcStride);
> +void x265_dct8_sse4(const int16_t* src, int16_t* dst, intptr_t srcStride);
> +void x265_dct4_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
> +void x265_dct8_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
> +void x265_dct16_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
> +void x265_dct32_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
>  
> -void x265_idst4_sse2(const int16_t* src, int16_t* dst, intptr_t stride);
> -void x265_idct4_sse2(const int16_t* src, int16_t* dst, intptr_t stride);
> -void x265_idct4_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
> -void x265_idct8_ssse3(const int16_t* src, int16_t* dst, intptr_t stride);
> -void x265_idct8_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
> -void x265_idct16_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
> +void x265_idst4_sse2(const int16_t* src, int16_t* dst, intptr_t dstStride);
> +void x265_idct4_sse2(const int16_t* src, int16_t* dst, intptr_t dstStride);
> +void x265_idct4_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
> +void x265_idct8_ssse3(const int16_t* src, int16_t* dst, intptr_t dstStride);
> +void x265_idct8_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
> +void x265_idct16_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
> +void x265_idct32_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
>  
>  void x265_denoise_dct_sse4(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
>  void x265_denoise_dct_avx2(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
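
The stride renames in dct8.h just make the data direction explicit: the forward
transforms read a 2D residual block, so the stride belongs to the source, while
the inverse transforms write a 2D block back, so it belongs to the destination.
A hypothetical call site (buffer names are made up) would look like:

    /* forward: 2D residual in, contiguous coefficients out */
    x265_dct4_sse2(residual, coeffs, residStride);

    /* inverse: contiguous coefficients in, 2D residual out */
    x265_idct4_sse2(coeffs, residual, residStride);
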
> diff -r dfe0803ae6be -r b4454aa1b6ab source/encoder/search.cpp
> --- a/source/encoder/search.cpp	Wed Nov 26 16:56:00 2014 -0600
> +++ b/source/encoder/search.cpp	Thu Nov 27 10:12:03 2014 +0900
> @@ -2211,8 +2211,8 @@
>              if (bTryZero)
>              {
>                  /* coincident blocks of the two reference pictures */
> -                const pixel *ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
> -                const pixel *ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
> +                const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
> +                const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
>                  intptr_t refStride = slice->m_mref[0][0].lumaStride;
>  
>                  primitives.pixelavg_pp[partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
> diff -r dfe0803ae6be -r b4454aa1b6ab source/test/pixelharness.cpp
> --- a/source/test/pixelharness.cpp	Wed Nov 26 16:56:00 2014 -0600
> +++ b/source/test/pixelharness.cpp	Thu Nov 27 10:12:03 2014 +0900
> @@ -344,60 +344,7 @@
>      return true;
>  }
>  
> -bool PixelHarness::check_copy16to16_shl_t(cpy16to16_shl_t ref, cpy16to16_shl_t opt)
> -{
> -    ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
> -    ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
> -
> -    int j = 0;
> -    intptr_t stride = STRIDE;
> -    for (int i = 0; i < ITERS; i++)
> -    {
> -        int shift = (rand() % 7 + 1);
> -
> -        int index = i % TEST_CASES;
> -        checked(opt, opt_dest, short_test_buff[index] + j, stride, shift, (int)stride);
> -        ref(ref_dest, short_test_buff[index] + j, stride, shift, (int)stride);
> -
> -        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
> -            return false;
> -
> -        reportfail();
> -        j += INCR;
> -    }
> -
> -    return true;
> -}
> -
> -bool PixelHarness::check_cvt16to32_shr_t(cvt16to32_shr_t ref, cvt16to32_shr_t opt)
> -{
> -    ALIGN_VAR_16(int32_t, ref_dest[64 * 64]);
> -    ALIGN_VAR_16(int32_t, opt_dest[64 * 64]);
> -
> -    memset(ref_dest, 0xCD, sizeof(ref_dest));
> -    memset(opt_dest, 0xCD, sizeof(opt_dest));
> -
> -    int j = 0;
> -    intptr_t stride = STRIDE;
> -    for (int i = 0; i < ITERS; i++)
> -    {
> -        int shift = (rand() % 7 + 1);
> -
> -        int index = i % TEST_CASES;
> -        checked(opt, opt_dest, short_test_buff[index] + j, stride, shift, (int)stride);
> -        ref(ref_dest, short_test_buff[index] + j, stride, shift, (int)stride);
> -
> -        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int32_t)))
> -            return false;
> -
> -        reportfail();
> -        j += INCR;
> -    }
> -
> -    return true;
> -}
> -
> -bool PixelHarness::check_cvt32to16_shl_t(cvt32to16_shl_t ref, cvt32to16_shl_t opt)
> +bool PixelHarness::check_cpy2Dto1D_shl_t(cpy2Dto1D_shl_t ref, cpy2Dto1D_shl_t opt)
>  {
>      ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
>      ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
> @@ -412,8 +359,36 @@
>          int shift = (rand() % 7 + 1);
>  
>          int index = i % TEST_CASES;
> -        checked(opt, opt_dest, int_test_buff[index] + j, stride, shift);
> -        ref(ref_dest, int_test_buff[index] + j, stride, shift);
> +        checked(opt, opt_dest, short_test_buff[index] + j, stride, shift);
> +        ref(ref_dest, short_test_buff[index] + j, stride, shift);
> +
> +        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
> +            return false;
> +
> +        reportfail();
> +        j += INCR;
> +    }
> +
> +    return true;
> +}
> +
> +bool PixelHarness::check_cpy2Dto1D_shr_t(cpy2Dto1D_shr_t ref, cpy2Dto1D_shr_t opt)
> +{
> +    ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
> +    ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
> +
> +    memset(ref_dest, 0xCD, sizeof(ref_dest));
> +    memset(opt_dest, 0xCD, sizeof(opt_dest));
> +
> +    int j = 0;
> +    intptr_t stride = STRIDE;
> +    for (int i = 0; i < ITERS; i++)
> +    {
> +        int shift = (rand() % 7 + 1);
> +
> +        int index = i % TEST_CASES;
> +        checked(opt, opt_dest, short_test_buff[index] + j, stride, shift);
> +        ref(ref_dest, short_test_buff[index] + j, stride, shift);
>  
>          if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
>              return false;
> @@ -451,7 +426,7 @@
>      return true;
>  }
>  
> -bool PixelHarness::check_copy_shr_t(copy_shr_t ref, copy_shr_t opt)
> +bool PixelHarness::check_cpy1Dto2D_shl_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt)
>  {
>      ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
>      ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
> @@ -466,8 +441,8 @@
>          int shift = (rand() % 7 + 1);
>  
>          int index = i % TEST_CASES;
> -        checked(opt, opt_dest, short_test_buff[index] + j, stride, shift, (int)STRIDE);
> -        ref(ref_dest, short_test_buff[index] + j, stride, shift, (int)STRIDE);
> +        checked(opt, opt_dest, short_test_buff[index] + j, stride, shift);
> +        ref(ref_dest, short_test_buff[index] + j, stride, shift);
>  
>          if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
>              return false;
> @@ -479,7 +454,7 @@
>      return true;
>  }
>  
> -bool PixelHarness::check_copy_shl_t(copy_shl_t ref, copy_shl_t opt)
> +bool PixelHarness::check_cpy1Dto2D_shr_t(cpy1Dto2D_shr_t ref, cpy1Dto2D_shr_t opt)
>  {
>      ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
>      ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
> @@ -1280,41 +1255,40 @@
>              }
>          }
>  
> -        if ((i < BLOCK_64x64) && opt.cvt16to32_shr[i])
> +        if ((i < BLOCK_64x64) && opt.cpy2Dto1D_shl[i])
>          {
> -            if (!check_cvt16to32_shr_t(ref.cvt16to32_shr[i], opt.cvt16to32_shr[i]))
> +            if (!check_cpy2Dto1D_shl_t(ref.cpy2Dto1D_shl[i], opt.cpy2Dto1D_shl[i]))
>              {
> -                printf("cvt16to32_shr failed!\n");
> +                printf("cpy2Dto1D_shl failed!\n");
>                  return false;
>              }
>          }
>  
> -        if ((i < BLOCK_64x64) && opt.cvt32to16_shl[i])
> +        if ((i < BLOCK_64x64) && opt.cpy2Dto1D_shr[i])
>          {
> -            if (!check_cvt32to16_shl_t(ref.cvt32to16_shl[i], opt.cvt32to16_shl[i]))
> +            if (!check_cpy2Dto1D_shr_t(ref.cpy2Dto1D_shr[i], opt.cpy2Dto1D_shr[i]))
>              {
> -                printf("cvt32to16_shl failed!\n");
> +                printf("cpy2Dto1D_shr failed!\n");
>                  return false;
>              }
>          }
>  
> -        if ((i < BLOCK_64x64) && opt.copy_shl[i])
> +        if ((i < BLOCK_64x64) && opt.cpy1Dto2D_shl[i])
>          {
> -            if (!check_copy_shl_t(ref.copy_shl[i], opt.copy_shl[i]))
> +            if (!check_cpy1Dto2D_shl_t(ref.cpy1Dto2D_shl[i], opt.cpy1Dto2D_shl[i]))
>              {
> -                printf("copy_shl[%dx%d] failed!\n", 4 << i, 4 << i);
> +                printf("cpy1Dto2D_shl[%dx%d] failed!\n", 4 << i, 4 << i);
>                  return false;
>              }
>          }
>  
> -    }
> -
> -    if (opt.cpy16to16_shl)
> -    {
> -        if (!check_copy16to16_shl_t(ref.cpy16to16_shl, opt.cpy16to16_shl))
> +        if ((i < BLOCK_64x64) && opt.cpy1Dto2D_shr[i])
>          {
> -            printf("copy16to16_shl failed!\n");
> -            return false;
> +            if (!check_cpy1Dto2D_shr_t(ref.cpy1Dto2D_shr[i], opt.cpy1Dto2D_shr[i]))
> +            {
> +                printf("cpy1Dto2D_shr[%dx%d] failed!\n", 4 << i, 4 << i);
> +                return false;
> +            }
>          }
>      }
>  
> @@ -1408,15 +1382,6 @@
>          }
>      }
>  
> -    if (opt.copy_shr)
> -    {
> -        if (!check_copy_shr_t(ref.copy_shr, opt.copy_shr))
> -        {
> -            printf("copy_shr failed!\n");
> -            return false;
> -        }
> -    }
> -
>      return true;
>  }
>  
> @@ -1637,16 +1602,28 @@
>              REPORT_SPEEDUP(opt.var[i], ref.var[i], pbuf1, STRIDE);
>          }
>  
> -        if ((i < BLOCK_64x64) && opt.cvt16to32_shr[i])
> +        if ((i < BLOCK_64x64) && opt.cpy2Dto1D_shl[i])
>          {
> -            HEADER("cvt16to32_shr[%dx%d]", 4 << i, 4 << i);
> -            REPORT_SPEEDUP(opt.cvt16to32_shr[i], ref.cvt16to32_shr[i], ibuf1, sbuf2, STRIDE, 3, 4);
> +            HEADER("cpy2Dto1D_shl[%dx%d]", 4 << i, 4 << i);
> +            REPORT_SPEEDUP(opt.cpy2Dto1D_shl[i], ref.cpy2Dto1D_shl[i], sbuf1, sbuf2, STRIDE, MAX_TR_DYNAMIC_RANGE - X265_DEPTH - (i + 2));
>          }
>  
> -        if ((i < BLOCK_64x64) && opt.cvt32to16_shl[i])
> +        if ((i < BLOCK_64x64) && opt.cpy2Dto1D_shr[i])
>          {
> -            HEADER("cvt32to16_shl[%dx%d]", 4 << i, 4 << i);
> -            REPORT_SPEEDUP(opt.cvt32to16_shl[i], ref.cvt32to16_shl[i], sbuf2, ibuf1, STRIDE, 3);
> +            HEADER("cpy2Dto1D_shr[%dx%d]", 4 << i, 4 << i);
> +            REPORT_SPEEDUP(opt.cpy2Dto1D_shr[i], ref.cpy2Dto1D_shr[i], sbuf1, sbuf2, STRIDE, 3);
> +        }
> +
> +        if ((i < BLOCK_64x64) && opt.cpy1Dto2D_shl[i])
> +        {
> +            HEADER("cpy1Dto2D_shl[%dx%d]", 4 << i, 4 << i);
> +            REPORT_SPEEDUP(opt.cpy1Dto2D_shl[i], ref.cpy1Dto2D_shl[i], sbuf1, sbuf2, STRIDE, 64);
> +        }
> +
> +        if ((i < BLOCK_64x64) && opt.cpy1Dto2D_shr[i])
> +        {
> +            HEADER("cpy1Dto2D_shr[%dx%d]", 4 << i, 4 << i);
> +            REPORT_SPEEDUP(opt.cpy1Dto2D_shr[i], ref.cpy1Dto2D_shr[i], sbuf1, sbuf2, STRIDE, 64);
>          }
>  
>          if ((i < BLOCK_64x64) && opt.copy_cnt[i])
> @@ -1654,19 +1631,6 @@
>              HEADER("copy_cnt[%dx%d]", 4 << i, 4 << i);
>              REPORT_SPEEDUP(opt.copy_cnt[i], ref.copy_cnt[i], sbuf1, sbuf2, STRIDE);
>          }
> -
> -        if ((i < BLOCK_64x64) && opt.copy_shl[i])
> -        {
> -            HEADER("copy_shl[%dx%d]", 4 << i, 4 << i);
> -            REPORT_SPEEDUP(opt.copy_shl[i], ref.copy_shl[i], sbuf1, sbuf2, STRIDE, 64);
> -        }
> -
> -    }
> -
> -    if (opt.cpy16to16_shl)
> -    {
> -        HEADER0("cpy16to16_shl");
> -        REPORT_SPEEDUP(opt.cpy16to16_shl, ref.cpy16to16_shl, sbuf2, sbuf1, 64, 5, 64);
>      }
>  
>      if (opt.weight_pp)
> @@ -1728,11 +1692,4 @@
>          HEADER0("planecopy_cp");
>          REPORT_SPEEDUP(opt.planecopy_cp, ref.planecopy_cp, uchar_test_buff[0], 64, pbuf1, 64, 64, 64, 2);
>      }
> -
> -    if (opt.copy_shr)
> -    {
> -        HEADER0("copy_shr");
> -        REPORT_SPEEDUP(opt.copy_shr, ref.copy_shr, sbuf1, sbuf2, 64, 5, 64);
> -    }
> -
>  }
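
For what it's worth, the harness indexing above also shows how the per-size
kernels are meant to be picked out of the block-size indexed primitive tables;
a rough sketch of the dispatch (table and field names are assumptions based on
the test code, not taken from the patch) is:

    int sizeIdx = log2TrSize - 2;   /* 4x4 -> 0, 8x8 -> 1, ... matches "4 << i" */
    int shift   = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;

    /* 2D residual -> contiguous, shifted-up coefficient buffer */
    primitives.cpy2Dto1D_shl[sizeIdx](coeffs, residual, residStride, shift);
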
> diff -r dfe0803ae6be -r b4454aa1b6ab source/test/pixelharness.h
> --- a/source/test/pixelharness.h	Wed Nov 26 16:56:00 2014 -0600
> +++ b/source/test/pixelharness.h	Thu Nov 27 10:12:03 2014 +0900
> @@ -80,12 +80,11 @@
>      bool check_weightp(weightp_pp_t ref, weightp_pp_t opt);
>      bool check_weightp(weightp_sp_t ref, weightp_sp_t opt);
>      bool check_downscale_t(downscale_t ref, downscale_t opt);
> -    bool check_copy16to16_shl_t(cpy16to16_shl_t ref, cpy16to16_shl_t opt);
> -    bool check_cvt16to32_shr_t(cvt16to32_shr_t ref, cvt16to32_shr_t opt);
> -    bool check_cvt32to16_shl_t(cvt32to16_shl_t ref, cvt32to16_shl_t opt);
> +    bool check_cpy2Dto1D_shl_t(cpy2Dto1D_shl_t ref, cpy2Dto1D_shl_t opt);
> +    bool check_cpy2Dto1D_shr_t(cpy2Dto1D_shr_t ref, cpy2Dto1D_shr_t opt);
> +    bool check_cpy1Dto2D_shl_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt);
> +    bool check_cpy1Dto2D_shr_t(cpy1Dto2D_shr_t ref, cpy1Dto2D_shr_t opt);
>      bool check_copy_cnt_t(copy_cnt_t ref, copy_cnt_t opt);
> -    bool check_copy_shr_t(copy_shr_t ref, copy_shr_t opt);
> -    bool check_copy_shl_t(copy_shl_t ref, copy_shl_t opt);
>      bool check_pixel_var(var_t ref, var_t opt);
>      bool check_ssim_4x4x2_core(ssim_4x4x2_core_t ref, ssim_4x4x2_core_t opt);
>      bool check_ssim_end(ssim_end4_t ref, ssim_end4_t opt);

-- 
Steve Borho

