[x265] primitives: refactor tskip-related functions

Satoshi Nakagawa nakagawa424 at oki.com
Thu Nov 27 02:14:26 CET 2014


# HG changeset patch
# User Satoshi Nakagawa <nakagawa424 at oki.com>
# Date 1417050723 -32400
#      Thu Nov 27 10:12:03 2014 +0900
# Node ID b4454aa1b6ab610c20241eb8fd5c73268b1ae3e0
# Parent  dfe0803ae6be925281cd6101fc0354a34bedfefd
primitives: refactor tskip-related functions
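
This collapses the five loosely typed copy/shift primitives
(cpy16to16_shl, cvt16to32_shr, cvt32to16_shl, copy_shl, copy_shr) into
four uniformly named and typed ones, summarized here from the hunks
below; all operate on int16_t, take a single stride, and derive the
rounding offset internally as 1 << (shift - 1):

    cpy2Dto1D_shl  strided 2D block -> contiguous 1D, << shift   (was cpy16to16_shl)
    cpy2Dto1D_shr  strided 2D block -> contiguous 1D, rounded >>  (was cvt16to32_shr, output narrowed to int16_t)
    cpy1Dto2D_shl  contiguous 1D -> strided 2D block, << shift   (was copy_shl / cvt32to16_shl)
    cpy1Dto2D_shr  contiguous 1D -> strided 2D block, rounded >>  (was copy_shr)

All four share one signature shape, e.g.:

    typedef void (*cpy2Dto1D_shl_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);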

diff -r dfe0803ae6be -r b4454aa1b6ab source/common/dct.cpp
--- a/source/common/dct.cpp	Wed Nov 26 16:56:00 2014 -0600
+++ b/source/common/dct.cpp	Thu Nov 27 10:12:03 2014 +0900
@@ -440,7 +440,7 @@
     }
 }
 
-void dst4_c(const int16_t *src, int16_t *dst, intptr_t stride)
+void dst4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
 {
     const int shift_1st = 1 + X265_DEPTH - 8;
     const int shift_2nd = 8;
@@ -450,14 +450,14 @@
 
     for (int i = 0; i < 4; i++)
     {
-        memcpy(&block[i * 4], &src[i * stride], 4 * sizeof(int16_t));
+        memcpy(&block[i * 4], &src[i * srcStride], 4 * sizeof(int16_t));
     }
 
     fastForwardDst(block, coef, shift_1st);
     fastForwardDst(coef, dst, shift_2nd);
 }
 
-void dct4_c(const int16_t *src, int16_t *dst, intptr_t stride)
+void dct4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
 {
     const int shift_1st = 1 + X265_DEPTH - 8;
     const int shift_2nd = 8;
@@ -467,14 +467,14 @@
 
     for (int i = 0; i < 4; i++)
     {
-        memcpy(&block[i * 4], &src[i * stride], 4 * sizeof(int16_t));
+        memcpy(&block[i * 4], &src[i * srcStride], 4 * sizeof(int16_t));
     }
 
     partialButterfly4(block, coef, shift_1st, 4);
     partialButterfly4(coef, dst, shift_2nd, 4);
 }
 
-void dct8_c(const int16_t *src, int16_t *dst, intptr_t stride)
+void dct8_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
 {
     const int shift_1st = 2 + X265_DEPTH - 8;
     const int shift_2nd = 9;
@@ -484,14 +484,14 @@
 
     for (int i = 0; i < 8; i++)
     {
-        memcpy(&block[i * 8], &src[i * stride], 8 * sizeof(int16_t));
+        memcpy(&block[i * 8], &src[i * srcStride], 8 * sizeof(int16_t));
     }
 
     partialButterfly8(block, coef, shift_1st, 8);
     partialButterfly8(coef, dst, shift_2nd, 8);
 }
 
-void dct16_c(const int16_t *src, int16_t *dst, intptr_t stride)
+void dct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
 {
     const int shift_1st = 3 + X265_DEPTH - 8;
     const int shift_2nd = 10;
@@ -501,14 +501,14 @@
 
     for (int i = 0; i < 16; i++)
     {
-        memcpy(&block[i * 16], &src[i * stride], 16 * sizeof(int16_t));
+        memcpy(&block[i * 16], &src[i * srcStride], 16 * sizeof(int16_t));
     }
 
     partialButterfly16(block, coef, shift_1st, 16);
     partialButterfly16(coef, dst, shift_2nd, 16);
 }
 
-void dct32_c(const int16_t *src, int16_t *dst, intptr_t stride)
+void dct32_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
 {
     const int shift_1st = 4 + X265_DEPTH - 8;
     const int shift_2nd = 11;
@@ -518,14 +518,14 @@
 
     for (int i = 0; i < 32; i++)
     {
-        memcpy(&block[i * 32], &src[i * stride], 32 * sizeof(int16_t));
+        memcpy(&block[i * 32], &src[i * srcStride], 32 * sizeof(int16_t));
     }
 
     partialButterfly32(block, coef, shift_1st, 32);
     partialButterfly32(coef, dst, shift_2nd, 32);
 }
 
-void idst4_c(const int16_t *src, int16_t *dst, intptr_t stride)
+void idst4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
 {
     const int shift_1st = 7;
     const int shift_2nd = 12 - (X265_DEPTH - 8);
@@ -538,11 +538,11 @@
 
     for (int i = 0; i < 4; i++)
     {
-        memcpy(&dst[i * stride], &block[i * 4], 4 * sizeof(int16_t));
+        memcpy(&dst[i * dstStride], &block[i * 4], 4 * sizeof(int16_t));
     }
 }
 
-void idct4_c(const int16_t *src, int16_t *dst, intptr_t stride)
+void idct4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
 {
     const int shift_1st = 7;
     const int shift_2nd = 12 - (X265_DEPTH - 8);
@@ -555,11 +555,11 @@
 
     for (int i = 0; i < 4; i++)
     {
-        memcpy(&dst[i * stride], &block[i * 4], 4 * sizeof(int16_t));
+        memcpy(&dst[i * dstStride], &block[i * 4], 4 * sizeof(int16_t));
     }
 }
 
-void idct8_c(const int16_t *src, int16_t *dst, intptr_t stride)
+void idct8_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
 {
     const int shift_1st = 7;
     const int shift_2nd = 12 - (X265_DEPTH - 8);
@@ -569,13 +569,14 @@
 
     partialButterflyInverse8(src, coef, shift_1st, 8);
     partialButterflyInverse8(coef, block, shift_2nd, 8);
+
     for (int i = 0; i < 8; i++)
     {
-        memcpy(&dst[i * stride], &block[i * 8], 8 * sizeof(int16_t));
+        memcpy(&dst[i * dstStride], &block[i * 8], 8 * sizeof(int16_t));
     }
 }
 
-void idct16_c(const int16_t *src, int16_t *dst, intptr_t stride)
+void idct16_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
 {
     const int shift_1st = 7;
     const int shift_2nd = 12 - (X265_DEPTH - 8);
@@ -585,13 +586,14 @@
 
     partialButterflyInverse16(src, coef, shift_1st, 16);
     partialButterflyInverse16(coef, block, shift_2nd, 16);
+
     for (int i = 0; i < 16; i++)
     {
-        memcpy(&dst[i * stride], &block[i * 16], 16 * sizeof(int16_t));
+        memcpy(&dst[i * dstStride], &block[i * 16], 16 * sizeof(int16_t));
     }
 }
 
-void idct32_c(const int16_t *src, int16_t *dst, intptr_t stride)
+void idct32_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
 {
     const int shift_1st = 7;
     const int shift_2nd = 12 - (X265_DEPTH - 8);
@@ -604,7 +606,7 @@
 
     for (int i = 0; i < 32; i++)
     {
-        memcpy(&dst[i * stride], &block[i * 32], 32 * sizeof(int16_t));
+        memcpy(&dst[i * dstStride], &block[i * 32], 32 * sizeof(int16_t));
     }
 }
 
@@ -632,7 +634,7 @@
     }
 }
 
-void dequant_scaling_c(const int16_t* quantCoef, const int32_t *deQuantCoef, int16_t* coef, int num, int per, int shift)
+void dequant_scaling_c(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift)
 {
     X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);
 
@@ -724,15 +726,15 @@
 }
 
 template<int trSize>
-uint32_t copy_count(int16_t* coeff, const int16_t* residual, intptr_t stride)
+uint32_t copy_count(int16_t* coeff, const int16_t* residual, intptr_t resiStride)
 {
     uint32_t numSig = 0;
     for (int k = 0; k < trSize; k++)
     {
         for (int j = 0; j < trSize; j++)
         {
-            coeff[k * trSize + j] = residual[k * stride + j];
-            numSig += (residual[k * stride + j] != 0);
+            coeff[k * trSize + j] = residual[k * resiStride + j];
+            numSig += (residual[k * resiStride + j] != 0);
         }
     }
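
For context, copy_count is the forward "transform" of the transquant
bypass (lossless) path: it copies the strided residual block into the
contiguous coefficient array and returns the number of nonzero samples
(see Quant::transformNxN below). A minimal usage sketch; the buffer
names and sizes are hypothetical:

    int16_t residual[8 * 64] = { 0 };  // one 8x8 block in a 64-sample-wide residual plane
    int16_t coeff[8 * 8];              // contiguous coefficient buffer
    uint32_t numSig = copy_count<8>(coeff, residual, /*resiStride=*/64);
    // a zero return means the block has no significant coefficients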
 
diff -r dfe0803ae6be -r b4454aa1b6ab source/common/pixel.cpp
--- a/source/common/pixel.cpp	Wed Nov 26 16:56:00 2014 -0600
+++ b/source/common/pixel.cpp	Thu Nov 27 10:12:03 2014 +0900
@@ -32,32 +32,32 @@
 
 using namespace x265;
 
-#define SET_FUNC_PRIMITIVE_TABLE_C(FUNC_PREFIX, FUNC_PREFIX_DEF, FUNC_TYPE_CAST, DATA_TYPE1, DATA_TYPE2) \
-    p.FUNC_PREFIX[LUMA_4x4]   = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4,  4, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_8x8]   = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8,  8, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_8x4]   = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8,  4, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_4x8]   = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4,  8, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_16x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 16, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_16x8]  = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16,  8, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_8x16]  = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 16, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_16x12] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 12, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_12x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<12, 16, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_16x4]  = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16,  4, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_4x16]  = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4, 16, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_32x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 32, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_32x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 16, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_16x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 32, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_32x24] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 24, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_24x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<24, 32, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_32x8]  = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32,  8, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_8x32]  = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 32, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_64x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 64, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_64x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 32, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_32x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 64, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_64x48] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 48, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_48x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<48, 64, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_64x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 16, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_16x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 64, DATA_TYPE1, DATA_TYPE2>;
+#define SET_FUNC_PRIMITIVE_TABLE_C(FUNC_PREFIX, FUNC_PREFIX_DEF, DATA_TYPE1, DATA_TYPE2) \
+    p.FUNC_PREFIX[LUMA_4x4]   = FUNC_PREFIX_DEF<4,  4, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_8x8]   = FUNC_PREFIX_DEF<8,  8, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_8x4]   = FUNC_PREFIX_DEF<8,  4, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_4x8]   = FUNC_PREFIX_DEF<4,  8, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_16x16] = FUNC_PREFIX_DEF<16, 16, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_16x8]  = FUNC_PREFIX_DEF<16,  8, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_8x16]  = FUNC_PREFIX_DEF<8, 16, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_16x12] = FUNC_PREFIX_DEF<16, 12, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_12x16] = FUNC_PREFIX_DEF<12, 16, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_16x4]  = FUNC_PREFIX_DEF<16,  4, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_4x16]  = FUNC_PREFIX_DEF<4, 16, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_32x32] = FUNC_PREFIX_DEF<32, 32, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_32x16] = FUNC_PREFIX_DEF<32, 16, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_16x32] = FUNC_PREFIX_DEF<16, 32, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_32x24] = FUNC_PREFIX_DEF<32, 24, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_24x32] = FUNC_PREFIX_DEF<24, 32, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_32x8]  = FUNC_PREFIX_DEF<32,  8, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_8x32]  = FUNC_PREFIX_DEF<8, 32, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_64x64] = FUNC_PREFIX_DEF<64, 64, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_64x32] = FUNC_PREFIX_DEF<64, 32, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_32x64] = FUNC_PREFIX_DEF<32, 64, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_64x48] = FUNC_PREFIX_DEF<64, 48, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_48x64] = FUNC_PREFIX_DEF<48, 64, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_64x16] = FUNC_PREFIX_DEF<64, 16, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_16x64] = FUNC_PREFIX_DEF<16, 64, DATA_TYPE1, DATA_TYPE2>;
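
Dropping FUNC_TYPE_CAST is more than cosmetic: now that the signatures
are uniform, each template instantiation must match the table's
function-pointer typedef exactly, so future signature drift becomes a
compile error instead of being hidden by the cast. For example, the
expansion used below assigns

    p.sse_sp[LUMA_4x4] = sse<4, 4, int16_t, pixel>;

which the compiler checks against pixelcmp_sp_t directly.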
 
 #define SET_FUNC_PRIMITIVE_TABLE_C2(FUNC_PREFIX) \
     p.FUNC_PREFIX[LUMA_4x4]   = FUNC_PREFIX<4,  4>; \
@@ -491,73 +491,73 @@
     }
 }
 
-void copy16to16_shl(int16_t *dst, const int16_t *src, intptr_t stride, int shift, int size)
+template<int size>
+void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
 {
-    X265_CHECK(!(size & 3), "invalid size\n");
+    X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
+    X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n");
+    X265_CHECK(shift >= 0, "invalid shift\n");
+
     for (int i = 0; i < size; i++)
     {
         for (int j = 0; j < size; j++)
-        {
-            dst[i * size + j] = src[i * stride + j] << shift;
-        }
+            dst[j] = src[j] << shift;
+
+        src += srcStride;
+        dst += size;
     }
 }
 
 template<int size>
-void convert16to32_shr(int32_t* dst, const int16_t* src, intptr_t stride, int shift, int offset)
+void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
 {
+    X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
+    X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n");
+    X265_CHECK(shift > 0, "invalid shift\n");
+
+    int16_t round = 1 << (shift - 1);
     for (int i = 0; i < size; i++)
     {
         for (int j = 0; j < size; j++)
-        {
-            dst[i * size + j] = ((int)src[i * stride + j] + offset) >> shift;
-        }
+            dst[j] = (src[j] + round) >> shift;
+
+        src += srcStride;
+        dst += size;
     }
 }
 
-void copy_shr(int16_t* dst, const int16_t* src, intptr_t stride, int shift, int size)
+template<int size>
+void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
 {
-    int round = 1 << (shift - 1);
+    X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n");
+    X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
+    X265_CHECK(shift >= 0, "invalid shift\n");
 
     for (int i = 0; i < size; i++)
     {
         for (int j = 0; j < size; j++)
-        {
-            dst[j] = (int16_t)((src[j] + round) >> shift);
-        }
+            dst[j] = src[j] << shift;
 
         src += size;
-        dst += stride;
+        dst += dstStride;
     }
 }
 
 template<int size>
-void convert32to16_shl(int16_t* dst, const int32_t* src, intptr_t stride, int shift)
+void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
 {
+    X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n");
+    X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
+    X265_CHECK(shift > 0, "invalid shift\n");
+
+    int16_t round = 1 << (shift - 1);
     for (int i = 0; i < size; i++)
     {
         for (int j = 0; j < size; j++)
-        {
-            dst[j] = ((int16_t)src[j] << shift);
-        }
+            dst[j] = (src[j] + round) >> shift;
 
         src += size;
-        dst += stride;
-    }
-}
-
-template<int size>
-void copy_shl(int16_t* dst, const int16_t* src, intptr_t stride, int shift)
-{
-    for (int i = 0; i < size; i++)
-    {
-        for (int j = 0; j < size; j++)
-        {
-            dst[j] = (src[j] << shift);
-        }
-
-        src += size;
-        dst += stride;
+        dst += dstStride;
     }
 }
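
When the shifted values fit in int16_t, cpy2Dto1D_shl and cpy1Dto2D_shr
round-trip exactly: the 1 << (shift - 1) rounding term cannot disturb
bits the left shift produced. A small standalone sketch, assuming the
two templates above are in scope; buffers and sizes are illustrative:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    void roundtrip()
    {
        const int shift = 5;
        const intptr_t stride = 16;
        int16_t plane[4 * 16] = { 0 }, back[4 * 16] = { 0 };
        alignas(16) int16_t lin[16];  // 1D buffer must be 16-byte aligned per the checks above
        for (int i = 0; i < 16; i++)
            plane[(i / 4) * stride + (i % 4)] = (int16_t)(i - 8);

        cpy2Dto1D_shl<4>(lin, plane, stride, shift);  // 2D -> 1D, << 5
        cpy1Dto2D_shr<4>(back, lin, stride, shift);   // 1D -> 2D, rounded >> 5

        for (int i = 0; i < 4; i++)
            assert(!memcmp(&back[i * stride], &plane[i * stride], 4 * sizeof(int16_t)));
    }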
 
@@ -1263,9 +1263,9 @@
     CHROMA_444(64, 16);
     CHROMA_444(16, 64);
 
-    SET_FUNC_PRIMITIVE_TABLE_C(sse_pp, sse, pixelcmp_t, pixel, pixel)
-    SET_FUNC_PRIMITIVE_TABLE_C(sse_sp, sse, pixelcmp_sp_t, int16_t, pixel)
-    SET_FUNC_PRIMITIVE_TABLE_C(sse_ss, sse, pixelcmp_ss_t, int16_t, int16_t)
+    SET_FUNC_PRIMITIVE_TABLE_C(sse_pp, sse, pixel, pixel)
+    SET_FUNC_PRIMITIVE_TABLE_C(sse_sp, sse, int16_t, pixel)
+    SET_FUNC_PRIMITIVE_TABLE_C(sse_ss, sse, int16_t, int16_t)
 
     p.blockfill_s[BLOCK_4x4]   = blockfil_s_c<4>;
     p.blockfill_s[BLOCK_8x8]   = blockfil_s_c<8>;
@@ -1273,21 +1273,22 @@
     p.blockfill_s[BLOCK_32x32] = blockfil_s_c<32>;
     p.blockfill_s[BLOCK_64x64] = blockfil_s_c<64>;
 
-    p.cpy16to16_shl = copy16to16_shl;
-    p.cvt16to32_shr[BLOCK_4x4] = convert16to32_shr<4>;
-    p.cvt16to32_shr[BLOCK_8x8] = convert16to32_shr<8>;
-    p.cvt16to32_shr[BLOCK_16x16] = convert16to32_shr<16>;
-    p.cvt16to32_shr[BLOCK_32x32] = convert16to32_shr<32>;
-    p.cvt32to16_shl[BLOCK_4x4] = convert32to16_shl<4>;
-    p.cvt32to16_shl[BLOCK_8x8] = convert32to16_shl<8>;
-    p.cvt32to16_shl[BLOCK_16x16] = convert32to16_shl<16>;
-    p.cvt32to16_shl[BLOCK_32x32] = convert32to16_shl<32>;
-
-    p.copy_shr = copy_shr;
-    p.copy_shl[BLOCK_4x4] = copy_shl<4>;
-    p.copy_shl[BLOCK_8x8] = copy_shl<8>;
-    p.copy_shl[BLOCK_16x16] = copy_shl<16>;
-    p.copy_shl[BLOCK_32x32] = copy_shl<32>;
+    p.cpy2Dto1D_shl[BLOCK_4x4] = cpy2Dto1D_shl<4>;
+    p.cpy2Dto1D_shl[BLOCK_8x8] = cpy2Dto1D_shl<8>;
+    p.cpy2Dto1D_shl[BLOCK_16x16] = cpy2Dto1D_shl<16>;
+    p.cpy2Dto1D_shl[BLOCK_32x32] = cpy2Dto1D_shl<32>;
+    p.cpy2Dto1D_shr[BLOCK_4x4] = cpy2Dto1D_shr<4>;
+    p.cpy2Dto1D_shr[BLOCK_8x8] = cpy2Dto1D_shr<8>;
+    p.cpy2Dto1D_shr[BLOCK_16x16] = cpy2Dto1D_shr<16>;
+    p.cpy2Dto1D_shr[BLOCK_32x32] = cpy2Dto1D_shr<32>;
+    p.cpy1Dto2D_shl[BLOCK_4x4] = cpy1Dto2D_shl<4>;
+    p.cpy1Dto2D_shl[BLOCK_8x8] = cpy1Dto2D_shl<8>;
+    p.cpy1Dto2D_shl[BLOCK_16x16] = cpy1Dto2D_shl<16>;
+    p.cpy1Dto2D_shl[BLOCK_32x32] = cpy1Dto2D_shl<32>;
+    p.cpy1Dto2D_shr[BLOCK_4x4] = cpy1Dto2D_shr<4>;
+    p.cpy1Dto2D_shr[BLOCK_8x8] = cpy1Dto2D_shr<8>;
+    p.cpy1Dto2D_shr[BLOCK_16x16] = cpy1Dto2D_shr<16>;
+    p.cpy1Dto2D_shr[BLOCK_32x32] = cpy1Dto2D_shr<32>;
 
     p.sa8d[BLOCK_4x4]   = satd_4x4;
     p.sa8d[BLOCK_8x8]   = sa8d_8x8;
diff -r dfe0803ae6be -r b4454aa1b6ab source/common/primitives.h
--- a/source/common/primitives.h	Wed Nov 26 16:56:00 2014 -0600
+++ b/source/common/primitives.h	Thu Nov 27 10:12:03 2014 +0900
@@ -138,32 +138,27 @@
 typedef int  (*pixel_ssd_s_t)(const int16_t* fenc, intptr_t fencstride);
 typedef void (*pixelcmp_x4_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
 typedef void (*pixelcmp_x3_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-typedef void (*blockcpy_sp_t)(int bx, int by, int16_t* dst, intptr_t dstride, const pixel* src, intptr_t sstride); // dst is aligned
-typedef void (*blockcpy_sc_t)(int bx, int by, int16_t* dst, intptr_t dstride, const uint8_t* src, intptr_t sstride); // dst is aligned
-typedef void (*pixelsub_ps_t)(int bx, int by, int16_t* dst, intptr_t dstride, const pixel* src0, const pixel* src1, intptr_t sstride0, intptr_t sstride1);
 typedef void (*pixelavg_pp_t)(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int weight);
 typedef void (*blockfill_s_t)(int16_t* dst, intptr_t dstride, int16_t val);
 
 typedef void (*intra_pred_t)(pixel* dst, intptr_t dstStride, pixel* refLeft, pixel* refAbove, int dirMode, int bFilter);
 typedef void (*intra_allangs_t)(pixel* dst, pixel* above0, pixel* left0, pixel* above1, pixel* left1, int bLuma);
 
-typedef void (*cpy16to16_shl_t)(int16_t* dst, const int16_t* src, intptr_t, int, int);
-typedef void (*cvt16to32_shl_t)(int32_t* dst, const int16_t* src, intptr_t, int, int);
-typedef void (*cvt16to32_shr_t)(int32_t* dst, const int16_t* src, intptr_t, int, int);
-typedef void (*cvt32to16_shl_t)(int16_t* dst, const int32_t* src, intptr_t, int);
-typedef uint32_t (*copy_cnt_t)(int16_t* coeff, const int16_t* residual, intptr_t stride);
-typedef void (*copy_shr_t)(int16_t* dst, const int16_t* src, intptr_t stride, int shift, int size);
-typedef void (*copy_shl_t)(int16_t* dst, const int16_t* src, intptr_t stride, int shift);
+typedef void (*cpy2Dto1D_shl_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+typedef void (*cpy2Dto1D_shr_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+typedef void (*cpy1Dto2D_shl_t)(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+typedef void (*cpy1Dto2D_shr_t)(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+typedef uint32_t (*copy_cnt_t)(int16_t* coeff, const int16_t* residual, intptr_t resiStride);
 
-typedef void (*dct_t)(const int16_t* src, int16_t* dst, intptr_t stride);
-typedef void (*idct_t)(const int16_t* src, int16_t* dst, intptr_t stride);
+typedef void (*dct_t)(const int16_t* src, int16_t* dst, intptr_t srcStride);
+typedef void (*idct_t)(const int16_t* src, int16_t* dst, intptr_t dstStride);
 typedef void (*denoiseDct_t)(int16_t* dctCoef, uint32_t* resSum, const uint16_t* offset, int numCoeff);
 
 typedef void (*calcresidual_t)(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
 typedef void (*transpose_t)(pixel* dst, const pixel* src, intptr_t stride);
-typedef uint32_t (*quant_t)(const int16_t *coef, const int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
-typedef uint32_t (*nquant_t)(const int16_t *coef, const int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
-typedef void (*dequant_scaling_t)(const int16_t* src, const int32_t*vdequantCoef, int16_t* dst, int num, int mcqp_miper, int shift);
+typedef uint32_t (*quant_t)(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
+typedef uint32_t (*nquant_t)(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
+typedef void (*dequant_scaling_t)(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift);
 typedef void (*dequant_normal_t)(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
 typedef int  (*count_nonzero_t)(const int16_t* quantCoeff, int numCoeff);
 
@@ -186,7 +181,7 @@
 typedef void (*filter_hv_pp_t) (const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY);
 typedef void (*filter_p2s_t)(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height);
 
-typedef void (*copy_pp_t)(pixel* dst, intptr_t dstride, const pixel* src, intptr_t sstride); // dst is aligned
+typedef void (*copy_pp_t)(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); // dst is aligned
 typedef void (*copy_sp_t)(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
 typedef void (*copy_ps_t)(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 typedef void (*copy_ss_t)(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
@@ -195,7 +190,7 @@
 typedef void (*pixel_add_ps_t)(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
 typedef void (*addAvg_t)(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
 
-typedef void (*saoCuOrgE0_t)(pixel* rec, int8_t*  offsetEo, int width, int8_t signLeft);
+typedef void (*saoCuOrgE0_t)(pixel* rec, int8_t* offsetEo, int width, int8_t signLeft);
 typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
 typedef void (*planecopy_sp_t) (const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
 
@@ -220,12 +215,11 @@
     pixelcmp_ss_t   psy_cost_ss[NUM_SQUARE_BLOCKS];
 
     blockfill_s_t   blockfill_s[NUM_SQUARE_BLOCKS];  // block fill with value
-    cpy16to16_shl_t cpy16to16_shl;
-    cvt16to32_shr_t cvt16to32_shr[NUM_SQUARE_BLOCKS - 1];
-    cvt32to16_shl_t cvt32to16_shl[NUM_SQUARE_BLOCKS - 1];
+    cpy2Dto1D_shl_t cpy2Dto1D_shl[NUM_SQUARE_BLOCKS - 1];
+    cpy2Dto1D_shr_t cpy2Dto1D_shr[NUM_SQUARE_BLOCKS - 1];
+    cpy1Dto2D_shl_t cpy1Dto2D_shl[NUM_SQUARE_BLOCKS - 1];
+    cpy1Dto2D_shr_t cpy1Dto2D_shr[NUM_SQUARE_BLOCKS - 1];
     copy_cnt_t      copy_cnt[NUM_SQUARE_BLOCKS - 1];
-    copy_shr_t      copy_shr;
-    copy_shl_t      copy_shl[NUM_SQUARE_BLOCKS - 1];
 
     copy_pp_t       luma_copy_pp[NUM_LUMA_PARTITIONS];
     copy_sp_t       luma_copy_sp[NUM_LUMA_PARTITIONS];
diff -r dfe0803ae6be -r b4454aa1b6ab source/common/quant.cpp
--- a/source/common/quant.cpp	Wed Nov 26 16:56:00 2014 -0600
+++ b/source/common/quant.cpp	Thu Nov 27 10:12:03 2014 +0900
@@ -322,49 +322,46 @@
     return numSig;
 }
 
-uint32_t Quant::transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t stride,
+uint32_t Quant::transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t resiStride,
                              coeff_t* coeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip)
 {
+    const uint32_t sizeIdx = log2TrSize - 2;
     if (cu.m_tqBypass[absPartIdx])
     {
         X265_CHECK(log2TrSize >= 2 && log2TrSize <= 5, "Block size mistake!\n");
-        return primitives.copy_cnt[log2TrSize - 2](coeff, residual, stride);
+        return primitives.copy_cnt[sizeIdx](coeff, residual, resiStride);
     }
 
     bool isLuma  = ttype == TEXT_LUMA;
     bool usePsy  = m_psyRdoqScale && isLuma && !useTransformSkip;
     int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; // Represents scaling through forward transform
-    int trSize = 1 << log2TrSize;
 
     X265_CHECK((cu.m_slice->m_sps->quadtreeTULog2MaxSize >= log2TrSize), "transform size too large\n");
     if (useTransformSkip)
     {
 #if X265_DEPTH <= 10
-        primitives.cpy16to16_shl(m_resiDctCoeff, residual, stride, transformShift, trSize);
+        X265_CHECK(transformShift >= 0, "invalid transformShift\n");
+        primitives.cpy2Dto1D_shl[sizeIdx](m_resiDctCoeff, residual, resiStride, transformShift);
 #else
         if (transformShift >= 0)
-            primitives.cvt16to32_shl(m_resiDctCoeff, residual, stride, transformShift, trSize);
+            primitives.cpy2Dto1D_shl[sizeIdx](m_resiDctCoeff, residual, resiStride, transformShift);
         else
-        {
-            int shift = -transformShift;
-            int offset = (1 << (shift - 1));
-            primitives.cvt16to32_shr[log2TrSize - 2](m_resiDctCoeff, residual, stride, shift, offset);
-        }
+            primitives.cpy2Dto1D_shr[sizeIdx](m_resiDctCoeff, residual, resiStride, -transformShift);
 #endif
     }
     else
     {
         bool isIntra = cu.isIntra(absPartIdx);
-        const uint32_t sizeIdx = log2TrSize - 2;
         int useDST = !sizeIdx && isLuma && isIntra;
         int index = DCT_4x4 + sizeIdx - useDST;
 
-        primitives.dct[index](residual, m_resiDctCoeff, stride);
+        primitives.dct[index](residual, m_resiDctCoeff, resiStride);
 
         /* NOTE: if RDOQ is disabled globally, psy-rdoq is also disabled, so
          * there is no risk of performing this DCT unnecessarily */
         if (usePsy)
         {
+            int trSize = 1 << log2TrSize;
             /* perform DCT on source pixels for psy-rdoq */
             primitives.square_copy_ps[sizeIdx](m_fencShortBuf, trSize, fenc, fencStride);
             primitives.dct[index](m_fencShortBuf, m_fencDctCoeff, trSize);
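
The #if X265_DEPTH <= 10 split above follows directly from the shift
formula (MAX_TR_DYNAMIC_RANGE is 15 in x265):

    transformShift = 15 - X265_DEPTH - log2TrSize

     8-bit,  4x4  (log2TrSize = 2): 15 -  8 - 2 =  5
    10-bit, 32x32 (log2TrSize = 5): 15 - 10 - 5 =  0   // still >= 0
    12-bit, 32x32 (log2TrSize = 5): 15 - 12 - 5 = -2   // shr branch needed

At depths up to 10 bits the shift can never go negative, so the shl
call plus the new X265_CHECK replaces the old runtime branch; only
deeper builds keep the cpy2Dto1D_shr fallback with -transformShift.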
@@ -408,12 +405,13 @@
     }
 }
 
-void Quant::invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t stride, const coeff_t* coeff,
+void Quant::invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t resiStride, const coeff_t* coeff,
                             uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig)
 {
+    const uint32_t sizeIdx = log2TrSize - 2;
     if (transQuantBypass)
     {
-        primitives.copy_shl[log2TrSize - 2](residual, coeff, stride, 0);
+        primitives.cpy1Dto2D_shl[sizeIdx](residual, coeff, resiStride, 0);
         return;
     }
 
@@ -427,7 +425,7 @@
     if (m_scalingList->m_bEnabled)
     {
         int scalingListType = (bIntra ? 0 : 3) + ttype;
-        const int32_t* dequantCoef = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem];
+        const int32_t* dequantCoef = m_scalingList->m_dequantCoef[sizeIdx][scalingListType][rem];
         primitives.dequant_scaling(coeff, dequantCoef, m_resiDctCoeff, numCoeff, per, shift);
     }
     else
@@ -438,20 +436,18 @@
 
     if (useTransformSkip)
     {
-        int trSize = 1 << log2TrSize;
-
 #if X265_DEPTH <= 10
-        primitives.copy_shr(residual, m_resiDctCoeff, stride, transformShift, trSize);
+        X265_CHECK(transformShift > 0, "invalid transformShift\n");
+        primitives.cpy1Dto2D_shr[sizeIdx](residual, m_resiDctCoeff, resiStride, transformShift);
 #else
         if (transformShift > 0)
-            primitives.copy_shr(residual, m_resiDctCoeff, stride, transformShift, trSize);
+            primitives.cpy1Dto2D_shr[sizeIdx](residual, m_resiDctCoeff, resiStride, transformShift);
         else
-            primitives.cvt32to16_shl[log2TrSize - 2](residual, m_resiDctCoeff, stride, -transformShift);
+            primitives.cpy1Dto2D_shl[sizeIdx](residual, m_resiDctCoeff, resiStride, -transformShift);
 #endif
     }
     else
     {
-        const uint32_t sizeIdx = log2TrSize - 2;
         int useDST = !sizeIdx && ttype == TEXT_LUMA && bIntra;
 
         X265_CHECK((int)numSig == primitives.count_nonzero(coeff, 1 << (log2TrSize * 2)), "numSig differ\n");
@@ -459,17 +455,17 @@
         // DC only
         if (numSig == 1 && coeff[0] != 0 && !useDST)
         {
-            const int shift_1st = 7;
+            const int shift_1st = 7 - 6;
             const int add_1st = 1 << (shift_1st - 1);
-            const int shift_2nd = 12 - (X265_DEPTH - 8);
+            const int shift_2nd = 12 - (X265_DEPTH - 8) - 3;
             const int add_2nd = 1 << (shift_2nd - 1);
 
-            int dc_val = (((m_resiDctCoeff[0] * 64 + add_1st) >> shift_1st) * 64 + add_2nd) >> shift_2nd;
-            primitives.blockfill_s[sizeIdx](residual, stride, (int16_t)dc_val);
+            int dc_val = (((m_resiDctCoeff[0] * (64 >> 6) + add_1st) >> shift_1st) * (64 >> 3) + add_2nd) >> shift_2nd;
+            primitives.blockfill_s[sizeIdx](residual, resiStride, (int16_t)dc_val);
             return;
         }
 
-        primitives.idct[IDCT_4x4 + sizeIdx - useDST](m_resiDctCoeff, residual, stride);
+        primitives.idct[IDCT_4x4 + sizeIdx - useDST](m_resiDctCoeff, residual, resiStride);
     }
 }
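
The new DC-only constants are an exact strength reduction, not a
behavior change: 64 = 2^6, and dividing a multiplier, its rounding
offset, and the shift amount by the same power of two leaves the
rounded result unchanged. Written out, with s = 12 - (X265_DEPTH - 8):

    (c * 64 + (1 << 6)) >> 7        ==  (c * (64 >> 6) + (1 << 0)) >> (7 - 6)
    (t * 64 + (1 << (s - 1))) >> s  ==  (t * (64 >> 3) + (1 << (s - 4))) >> (s - 3)

Presumably only 2^3 is factored out of the second stage so that
shift_2nd stays positive (and add_2nd remains a valid rounding term) at
high bit depths.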
 
diff -r dfe0803ae6be -r b4454aa1b6ab source/common/quant.h
--- a/source/common/quant.h	Wed Nov 26 16:56:00 2014 -0600
+++ b/source/common/quant.h	Thu Nov 27 10:12:03 2014 +0900
@@ -104,10 +104,10 @@
     /* CU setup */
     void setQPforQuant(const CUData& ctu);
 
-    uint32_t transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencstride, const int16_t* residual, uint32_t stride, coeff_t* coeff,
+    uint32_t transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t resiStride, coeff_t* coeff,
                           uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip);
 
-    void invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t stride, const coeff_t* coeff,
+    void invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t resiStride, const coeff_t* coeff,
                          uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig);
 
     /* static methods shared with entropy.cpp */
diff -r dfe0803ae6be -r b4454aa1b6ab source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Nov 26 16:56:00 2014 -0600
+++ b/source/common/x86/asm-primitives.cpp	Thu Nov 27 10:12:03 2014 +0900
@@ -1336,10 +1336,22 @@
         p.sad_x4[LUMA_16x4] = x265_pixel_sad_x4_16x4_sse2;
         p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_mmx2;
 
-        p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_sse2;
-        p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_sse2;
-        p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_sse2;
-        p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_sse2;
+        p.cpy2Dto1D_shl[BLOCK_4x4] = x265_cpy2Dto1D_shl_4_sse2;
+        p.cpy2Dto1D_shl[BLOCK_8x8] = x265_cpy2Dto1D_shl_8_sse2;
+        p.cpy2Dto1D_shl[BLOCK_16x16] = x265_cpy2Dto1D_shl_16_sse2;
+        p.cpy2Dto1D_shl[BLOCK_32x32] = x265_cpy2Dto1D_shl_32_sse2;
+        p.cpy2Dto1D_shr[BLOCK_4x4] = x265_cpy2Dto1D_shr_4_sse2;
+        p.cpy2Dto1D_shr[BLOCK_8x8] = x265_cpy2Dto1D_shr_8_sse2;
+        p.cpy2Dto1D_shr[BLOCK_16x16] = x265_cpy2Dto1D_shr_16_sse2;
+        p.cpy2Dto1D_shr[BLOCK_32x32] = x265_cpy2Dto1D_shr_32_sse2;
+        p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_sse2;
+        p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_sse2;
+        p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_sse2;
+        p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_sse2;
+        p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_sse2;
+        p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_sse2;
+        p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_sse2;
+        p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_sse2;
 
         CHROMA_PIXELSUB_PS(_sse2);
         CHROMA_PIXELSUB_PS_422(_sse2);
@@ -1406,10 +1418,6 @@
         p.quant = x265_quant_sse4;
         p.nquant = x265_nquant_sse4;
         p.dequant_normal = x265_dequant_normal_sse4;
-        p.cvt16to32_shr[BLOCK_4x4] = x265_cvt16to32_shr_4_sse4;
-        p.cvt16to32_shr[BLOCK_8x8] = x265_cvt16to32_shr_8_sse4;
-        p.cvt16to32_shr[BLOCK_16x16] = x265_cvt16to32_shr_16_sse4;
-        p.cvt16to32_shr[BLOCK_32x32] = x265_cvt16to32_shr_32_sse4;
         p.intra_pred[0][BLOCK_4x4] = x265_intra_pred_planar4_sse4;
         p.intra_pred[0][BLOCK_8x8] = x265_intra_pred_planar8_sse4;
         p.intra_pred[0][BLOCK_16x16] = x265_intra_pred_planar16_sse4;
@@ -1438,6 +1446,14 @@
         p.nquant = x265_nquant_avx2;
         p.dequant_normal = x265_dequant_normal_avx2;
         p.scale1D_128to64 = x265_scale1D_128to64_avx2;
+        p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_avx2;
+        p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_avx2;
+        p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_avx2;
+        p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_avx2;
+        p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_avx2;
+        p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_avx2;
+        p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_avx2;
+        p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_avx2;
 #if X86_64
         p.dct[DCT_8x8] = x265_dct8_avx2;
         p.dct[DCT_16x16] = x265_dct16_avx2;
@@ -1548,11 +1564,23 @@
         p.frame_init_lowres_core = x265_frame_init_lowres_core_sse2;
         SA8D_INTER_FROM_BLOCK(sse2);
 
-        p.cpy16to16_shl = x265_copy16to16_shl_sse2;
-        p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_sse2;
-        p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_sse2;
-        p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_sse2;
-        p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_sse2;
+        p.cpy2Dto1D_shl[BLOCK_4x4] = x265_cpy2Dto1D_shl_4_sse2;
+        p.cpy2Dto1D_shl[BLOCK_8x8] = x265_cpy2Dto1D_shl_8_sse2;
+        p.cpy2Dto1D_shl[BLOCK_16x16] = x265_cpy2Dto1D_shl_16_sse2;
+        p.cpy2Dto1D_shl[BLOCK_32x32] = x265_cpy2Dto1D_shl_32_sse2;
+        p.cpy2Dto1D_shr[BLOCK_4x4] = x265_cpy2Dto1D_shr_4_sse2;
+        p.cpy2Dto1D_shr[BLOCK_8x8] = x265_cpy2Dto1D_shr_8_sse2;
+        p.cpy2Dto1D_shr[BLOCK_16x16] = x265_cpy2Dto1D_shr_16_sse2;
+        p.cpy2Dto1D_shr[BLOCK_32x32] = x265_cpy2Dto1D_shr_32_sse2;
+        p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_sse2;
+        p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_sse2;
+        p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_sse2;
+        p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_sse2;
+        p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_sse2;
+        p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_sse2;
+        p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_sse2;
+        p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_sse2;
+
         p.calcresidual[BLOCK_4x4] = x265_getResidual4_sse2;
         p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2;
         p.transpose[BLOCK_4x4] = x265_transpose4_sse2;
@@ -1568,10 +1596,6 @@
         p.idct[IDST_4x4] = x265_idst4_sse2;
 
         p.planecopy_sp = x265_downShift_16_sse2;
-        p.copy_shl[BLOCK_4x4] = x265_copy_shl_4_sse2;
-        p.copy_shl[BLOCK_8x8] = x265_copy_shl_8_sse2;
-        p.copy_shl[BLOCK_16x16] = x265_copy_shl_16_sse2;
-        p.copy_shl[BLOCK_32x32] = x265_copy_shl_32_sse2;
     }
     if (cpuMask & X265_CPU_SSSE3)
     {
@@ -1615,10 +1639,6 @@
         LUMA_ADDAVG(_sse4);
         CHROMA_ADDAVG(_sse4);
         CHROMA_ADDAVG_422(_sse4);
-        p.cvt16to32_shr[BLOCK_4x4] = x265_cvt16to32_shr_4_sse4;
-        p.cvt16to32_shr[BLOCK_8x8] = x265_cvt16to32_shr_8_sse4;
-        p.cvt16to32_shr[BLOCK_16x16] = x265_cvt16to32_shr_16_sse4;
-        p.cvt16to32_shr[BLOCK_32x32] = x265_cvt16to32_shr_32_sse4;
 
         // TODO: check POPCNT flag!
         p.copy_cnt[BLOCK_4x4] = x265_copy_cnt_4_sse4;
@@ -1688,7 +1708,6 @@
         INTRA_ANG_SSE4(sse4);
 
         p.dct[DCT_8x8] = x265_dct8_sse4;
-        p.copy_shr = x265_copy_shr_sse4;
 //        p.denoiseDct = x265_denoise_dct_sse4;
     }
     if (cpuMask & X265_CPU_AVX)
@@ -1759,10 +1778,14 @@
         p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_avx2;
         p.blockfill_s[BLOCK_32x32] = x265_blockfill_s_32x32_avx2;
 
-        p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_avx2;
-        p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_avx2;
-        p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_avx2;
-        p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_avx2;
+        p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_avx2;
+        p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_avx2;
+        p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_avx2;
+        p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_avx2;
+        p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_avx2;
+        p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_avx2;
+        p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_avx2;
+        p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_avx2;
 
 //        p.denoiseDct = x265_denoise_dct_avx2;
         p.dct[DCT_4x4] = x265_dct4_avx2;
diff -r dfe0803ae6be -r b4454aa1b6ab source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm	Wed Nov 26 16:56:00 2014 -0600
+++ b/source/common/x86/blockcopy8.asm	Thu Nov 27 10:12:03 2014 +0900
@@ -41,7 +41,7 @@
 SECTION .text
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_2x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_2x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_pp_2x4, 4, 7, 0
@@ -59,7 +59,7 @@
 RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_2x8(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_2x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_pp_2x8, 4, 7, 0
@@ -97,7 +97,7 @@
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_2x16(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_2x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_pp_2x16, 4, 7, 0
@@ -115,7 +115,7 @@
 
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_4x2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_4x2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_pp_4x2, 4, 6, 0
@@ -127,7 +127,7 @@
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_4x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_4x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_pp_4x4, 4, 4, 4
@@ -145,7 +145,7 @@
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PP_W4_H8 2
 INIT_XMM sse2
@@ -192,7 +192,7 @@
 BLOCKCOPY_PP_W4_H8 4, 32
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_6x8(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_6x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_pp_6x8, 4, 7, 8
@@ -257,7 +257,7 @@
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_6x16(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_6x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_pp_6x16, 4, 7, 2
@@ -279,7 +279,7 @@
 
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_8x2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_8x2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_pp_8x2, 4, 4, 2
@@ -291,7 +291,7 @@
 RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_8x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_8x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_pp_8x4, 4, 4, 4
@@ -309,7 +309,7 @@
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_8x6(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_8x6(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_pp_8x6, 4, 7, 6
@@ -333,7 +333,7 @@
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_8x12(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_8x12(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_pp_8x12, 4, 5, 2
@@ -350,7 +350,7 @@
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PP_W8_H8 2
 INIT_XMM sse2
@@ -397,7 +397,7 @@
 BLOCKCOPY_PP_W8_H8 8, 64
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PP_W12_H4 2
 INIT_XMM sse2
@@ -439,7 +439,7 @@
 BLOCKCOPY_PP_W12_H4 12, 32
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_16x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_16x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PP_W16_H4 2
 INIT_XMM sse2
@@ -471,7 +471,7 @@
 BLOCKCOPY_PP_W16_H4 16, 12
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PP_W16_H8 2
 INIT_XMM sse2
@@ -519,7 +519,7 @@
 BLOCKCOPY_PP_W16_H8 16, 24
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PP_W24_H4 2
 INIT_XMM sse2
@@ -560,7 +560,7 @@
 BLOCKCOPY_PP_W24_H4 24, 64
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PP_W32_H4 2
 INIT_XMM sse2
@@ -684,7 +684,7 @@
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_32x24(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_32x24(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_YMM avx
 cglobal blockcopy_pp_32x24, 4, 7, 6
@@ -722,7 +722,7 @@
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PP_W32_H16_avx 2
 INIT_YMM avx
@@ -788,7 +788,7 @@
 BLOCKCOPY_PP_W32_H16_avx 32, 64
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PP_W48_H2 2
 INIT_XMM sse2
@@ -836,7 +836,7 @@
 BLOCKCOPY_PP_W48_H2 48, 64
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PP_W64_H4 2
 INIT_XMM sse2
@@ -897,7 +897,7 @@
 BLOCKCOPY_PP_W64_H4 64, 64
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_2x4(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_2x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal blockcopy_sp_2x4, 4, 5, 2
@@ -926,7 +926,7 @@
 
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_2x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_2x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal blockcopy_sp_2x8, 4, 5, 2
@@ -974,11 +974,11 @@
 RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SP_W2_H2 2
 INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 7, 2, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 7, 2, dst, dstStride, src, srcStride
     add         r3,     r3
     mov         r6d,    %2/2
 .loop:
@@ -1003,10 +1003,10 @@
 BLOCKCOPY_SP_W2_H2 2, 16
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_4x2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_4x2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal blockcopy_sp_4x2, 4, 4, 2, dest, destStride, src, srcStride
+cglobal blockcopy_sp_4x2, 4, 4, 2, dst, dstStride, src, srcStride
 
 add        r3,        r3
 
@@ -1022,10 +1022,10 @@
 RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_4x4(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_4x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal blockcopy_sp_4x4, 4, 4, 4, dest, destStride, src, srcStride
+cglobal blockcopy_sp_4x4, 4, 4, 4, dst, dstStride, src, srcStride
 
 add        r3,     r3
 
@@ -1049,10 +1049,10 @@
 RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_4x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_4x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal blockcopy_sp_4x8, 4, 4, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_4x8, 4, 4, 8, dst, dstStride, src, srcStride
 
 add        r3,      r3
 
@@ -1092,11 +1092,11 @@
 RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SP_W4_H8 2
 INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
 
 mov         r4d,    %2/8
 
@@ -1150,7 +1150,7 @@
 BLOCKCOPY_SP_W4_H8 4, 32
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_6x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_6x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal blockcopy_sp_6x8, 4, 4, 2
@@ -1213,11 +1213,11 @@
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SP_W6_H2 2
 INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 7, 4, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride
     add         r3,     r3
     mov         r6d,    %2/2
 .loop:
@@ -1247,10 +1247,10 @@
 BLOCKCOPY_SP_W6_H2 6, 16
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_8x2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_8x2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal blockcopy_sp_8x2, 4, 4, 2, dest, destStride, src, srcStride
+cglobal blockcopy_sp_8x2, 4, 4, 2, dst, dstStride, src, srcStride
 
 add        r3,         r3
 
@@ -1265,10 +1265,10 @@
 RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_8x4(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_8x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal blockcopy_sp_8x4, 4, 4, 4, dest, destStride, src, srcStride
+cglobal blockcopy_sp_8x4, 4, 4, 4, dst, dstStride, src, srcStride
 
 add        r3,     r3
 
@@ -1290,10 +1290,10 @@
 RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_8x6(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_8x6(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal blockcopy_sp_8x6, 4, 4, 6, dest, destStride, src, srcStride
+cglobal blockcopy_sp_8x6, 4, 4, 6, dst, dstStride, src, srcStride
 
 add        r3,      r3
 
@@ -1322,10 +1322,10 @@
 RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_8x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_8x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal blockcopy_sp_8x8, 4, 4, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_8x8, 4, 4, 8, dst, dstStride, src, srcStride
 
 add        r3,      r3
 
@@ -1361,11 +1361,11 @@
 RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SP_W8_H4 2
 INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 4, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 4, dst, dstStride, src, srcStride
     add         r3,     r3
     mov         r4d,    %2/4
 .loop:
@@ -1391,11 +1391,11 @@
 BLOCKCOPY_SP_W8_H4 8, 12
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SP_W8_H8 2
 INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
 
 mov         r4d,    %2/8
 
@@ -1446,11 +1446,11 @@
 BLOCKCOPY_SP_W8_H8 8, 64
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SP_W12_H4 2
 INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
 
 mov             r4d,     %2/4
 
@@ -1503,11 +1503,11 @@
 BLOCKCOPY_SP_W12_H4 12, 32
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SP_W16_H4 2
 INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
 
 mov             r4d,     %2/4
 
@@ -1554,11 +1554,11 @@
 BLOCKCOPY_SP_W16_H4 16, 24
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SP_W24_H2 2
 INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 6, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride
 
 mov             r4d,     %2/2
 
@@ -1595,11 +1595,11 @@
 BLOCKCOPY_SP_W24_H2 24, 64
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SP_W32_H2 2
 INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
 
 mov             r4d,     %2/2
 
@@ -1643,11 +1643,11 @@
 BLOCKCOPY_SP_W32_H2 32, 48
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SP_W48_H2 2
 INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 6, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride
 
 mov             r4d,     %2
 
@@ -1681,11 +1681,11 @@
 BLOCKCOPY_SP_W48_H2 48, 64
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SP_W64_H1 2
 INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
 
 mov             r4d,       %2
 
@@ -1726,10 +1726,10 @@
 BLOCKCOPY_SP_W64_H1 64, 64
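
For reference, every blockcopy_sp kernel above performs the same scalar
operation: copy a WxH block of int16_t into a pixel surface, saturating on
the way down (that is what packuswb does in the SSE2 bodies). The initial
"add r3, r3" merely converts the int16 element stride to bytes. A minimal C
sketch, not part of the patch; W and H stand for the %1x%2 template
arguments and pixel for the 8bpp uint8_t:

    #include <stdint.h>

    typedef uint8_t pixel;          /* 8bpp build assumed for this sketch */
    enum { W = 8, H = 4 };          /* placeholders for the %1x%2 sizes   */

    static void blockcopy_sp_c(pixel* dst, intptr_t dstStride,
                               const int16_t* src, intptr_t srcStride)
    {
        for (int y = 0; y < H; y++)
        {
            for (int x = 0; x < W; x++)
            {
                int16_t v = src[x];
                /* mirror packuswb: unsigned saturation to [0, 255] */
                dst[x] = (pixel)(v < 0 ? 0 : (v > 255 ? 255 : v));
            }

            dst += dstStride;
            src += srcStride;
        }
    }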
 
 ;-----------------------------------------------------------------------------
-; void blockfill_s_4x4(int16_t *dest, intptr_t destride, int16_t val)
+; void blockfill_s_4x4(int16_t* dst, intptr_t dstStride, int16_t val)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal blockfill_s_4x4, 3, 3, 1, dest, destStride, val
+cglobal blockfill_s_4x4, 3, 3, 1, dst, dstStride, val
 
 add        r1,            r1
 
@@ -1745,10 +1745,10 @@
 RET
 
 ;-----------------------------------------------------------------------------
-; void blockfill_s_8x8(int16_t *dest, intptr_t destride, int16_t val)
+; void blockfill_s_8x8(int16_t* dst, intptr_t dstStride, int16_t val)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal blockfill_s_8x8, 3, 3, 1, dest, destStride, val
+cglobal blockfill_s_8x8, 3, 3, 1, dst, dstStride, val
 
 add        r1,            r1
 
@@ -1774,11 +1774,11 @@
 RET
 
 ;-----------------------------------------------------------------------------
-; void blockfill_s_%1x%2(int16_t *dest, intptr_t destride, int16_t val)
+; void blockfill_s_%1x%2(int16_t* dst, intptr_t dstStride, int16_t val)
 ;-----------------------------------------------------------------------------
 %macro BLOCKFILL_S_W16_H8 2
 INIT_XMM sse2
-cglobal blockfill_s_%1x%2, 3, 5, 1, dest, destStride, val
+cglobal blockfill_s_%1x%2, 3, 5, 1, dst, dstStride, val
 
 mov        r3d,           %2/8
 
@@ -1855,11 +1855,11 @@
 RET
 
 ;-----------------------------------------------------------------------------
-; void blockfill_s_%1x%2(int16_t *dest, intptr_t destride, int16_t val)
+; void blockfill_s_%1x%2(int16_t* dst, intptr_t dstStride, int16_t val)
 ;-----------------------------------------------------------------------------
 %macro BLOCKFILL_S_W32_H4 2
 INIT_XMM sse2
-cglobal blockfill_s_%1x%2, 3, 5, 1, dest, destStride, val
+cglobal blockfill_s_%1x%2, 3, 5, 1, dst, dstStride, val
 
 mov        r3d,           %2/4
 
@@ -1983,10 +1983,10 @@
 RET
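
blockfill_s is simpler still: broadcast a single int16_t value across a WxH
block. A sketch under the same placeholder conventions as above:

    #include <stdint.h>

    enum { W = 16, H = 16 };        /* placeholder block size */

    static void blockfill_s_c(int16_t* dst, intptr_t dstStride, int16_t val)
    {
        for (int y = 0; y < H; y++)
        {
            for (int x = 0; x < W; x++)
                dst[x] = val;

            dst += dstStride;
        }
    }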
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ps_2x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_2x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal blockcopy_ps_2x4, 4, 4, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_2x4, 4, 4, 1, dst, dstStride, src, srcStride
 
 add        r1,            r1
 
@@ -2013,10 +2013,10 @@
 
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ps_2x8(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_2x8(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal blockcopy_ps_2x8, 4, 4, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_2x8, 4, 4, 1, dst, dstStride, src, srcStride
 
 add        r1,            r1
 
@@ -2065,10 +2065,10 @@
 
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ps_2x16(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_2x16(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal blockcopy_ps_2x16, 4, 5, 2, dest, destStride, src, srcStride
+cglobal blockcopy_ps_2x16, 4, 5, 2, dst, dstStride, src, srcStride
     add         r1,         r1
     mov         r4d,        16/2
 .loop:
@@ -2086,10 +2086,10 @@
 
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ps_4x2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_4x2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal blockcopy_ps_4x2, 4, 4, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_4x2, 4, 4, 1, dst, dstStride, src, srcStride
 
 add        r1,         r1
 
@@ -2105,10 +2105,10 @@
 
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ps_4x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_4x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal blockcopy_ps_4x4, 4, 4, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_4x4, 4, 4, 1, dst, dstStride, src, srcStride
 
 add        r1,            r1
 
@@ -2135,11 +2135,11 @@
 
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PS_W4_H4 2
 INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride
 
 add     r1,      r1
 mov    r4d,      %2/4
@@ -2180,11 +2180,11 @@
 
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PS_W6_H4 2
 INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride
 
 add     r1,      r1
 mov    r4d,      %2/4
@@ -2227,10 +2227,10 @@
 BLOCKCOPY_PS_W6_H4 6, 16
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ps_8x2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_8x2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal blockcopy_ps_8x2, 4, 4, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_8x2, 4, 4, 1, dst, dstStride, src, srcStride
 
 add        r1,         r1
 
@@ -2245,10 +2245,10 @@
 RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ps_8x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_8x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal blockcopy_ps_8x4, 4, 4, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_8x4, 4, 4, 1, dst, dstStride, src, srcStride
 
 add        r1,            r1
 
@@ -2274,10 +2274,10 @@
 RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ps_8x6(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_8x6(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal blockcopy_ps_8x6, 4, 4, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_8x6, 4, 4, 1, dst, dstStride, src, srcStride
 
 add        r1,            r1
 
@@ -2314,11 +2314,11 @@
 RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PS_W8_H4 2
 INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride
 
 add     r1,      r1
 mov    r4d,      %2/4
@@ -2361,11 +2361,11 @@
 
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PS_W12_H2 2
 INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
 
 add        r1,      r1
 mov        r4d,     %2/2
@@ -2398,10 +2398,10 @@
 BLOCKCOPY_PS_W12_H2 12, 32
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ps_16x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_16x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal blockcopy_ps_16x4, 4, 4, 3, dest, destStride, src, srcStride
+cglobal blockcopy_ps_16x4, 4, 4, 3, dst, dstStride, src, srcStride
 
 add        r1,      r1
 pxor       m0,      m0
@@ -2436,11 +2436,11 @@
 RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PS_W16_H4 2
 INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
 
 add        r1,      r1
 mov        r4d,     %2/4
@@ -2492,11 +2492,11 @@
 BLOCKCOPY_PS_W16_H4 16, 24
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PS_W24_H2 2
 INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
 
 add        r1,      r1
 mov        r4d,     %2/2
@@ -2537,11 +2537,11 @@
 BLOCKCOPY_PS_W24_H2 24, 64
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PS_W32_H2 2
 INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
 
 add        r1,      r1
 mov        r4d,     %2/2
@@ -2590,11 +2590,11 @@
 BLOCKCOPY_PS_W32_H2 32, 48
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PS_W48_H2 2
 INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
 
 add        r1,      r1
 mov        r4d,     %2/2
@@ -2649,11 +2649,11 @@
 BLOCKCOPY_PS_W48_H2 48, 64
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_PS_W64_H2 2
 INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
 
 add        r1,      r1
 mov        r4d,     %2/2
@@ -2723,7 +2723,7 @@
 BLOCKCOPY_PS_W64_H2 64, 64
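
blockcopy_ps widens in the opposite direction, pixel in and int16_t out; the
pxor/punpcklbw pairs in the SSE bodies zero-extend each byte. Sketch, again
with placeholder sizes:

    #include <stdint.h>

    typedef uint8_t pixel;          /* 8bpp build assumed */
    enum { W = 16, H = 4 };         /* placeholder block size */

    static void blockcopy_ps_c(int16_t* dst, intptr_t dstStride,
                               const pixel* src, intptr_t srcStride)
    {
        for (int y = 0; y < H; y++)
        {
            for (int x = 0; x < W; x++)
                dst[x] = (int16_t)src[x];   /* zero-extend, no clipping needed */

            dst += dstStride;
            src += srcStride;
        }
    }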
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_2x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_2x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_ss_2x4, 4, 6, 0
@@ -2746,7 +2746,7 @@
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_2x8(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_2x8(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_ss_2x8, 4, 6, 0
@@ -2785,7 +2785,7 @@
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_2x16(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_2x16(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_ss_2x16, 4, 7, 0
@@ -2805,7 +2805,7 @@
 
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_4x2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_4x2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_ss_4x2, 4, 4, 2
@@ -2821,7 +2821,7 @@
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_4x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_4x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_ss_4x4, 4, 4, 4
@@ -2841,7 +2841,7 @@
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SS_W4_H8 2
 INIT_XMM sse2
@@ -2889,7 +2889,7 @@
 BLOCKCOPY_SS_W4_H8 4, 32
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_6x8(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_6x8(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_ss_6x8, 4, 4, 4
@@ -2944,7 +2944,7 @@
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_6x16(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_6x16(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_ss_6x16, 4, 5, 4
@@ -2968,7 +2968,7 @@
 
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_8x2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_8x2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_ss_8x2, 4, 4, 2
@@ -2984,7 +2984,7 @@
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_8x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_8x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_ss_8x4, 4, 4, 4
@@ -3005,7 +3005,7 @@
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_8x6(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_8x6(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_ss_8x6, 4, 4, 4
@@ -3034,7 +3034,7 @@
     RET
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_8x12(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_8x12(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal blockcopy_ss_8x12, 4, 5, 2
@@ -3054,7 +3054,7 @@
 
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SS_W8_H8 2
 INIT_XMM sse2
@@ -3105,7 +3105,7 @@
 BLOCKCOPY_SS_W8_H8 8, 64
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SS_W12_H4 2
 INIT_XMM sse2
@@ -3149,7 +3149,7 @@
 BLOCKCOPY_SS_W12_H4 12, 32
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_16x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_16x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SS_W16_H4 2
 INIT_XMM sse2
@@ -3192,7 +3192,7 @@
 BLOCKCOPY_SS_W16_H4 16, 12
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SS_W16_H4_avx 2
 INIT_YMM avx
@@ -3229,7 +3229,7 @@
 BLOCKCOPY_SS_W16_H4_avx 16, 64
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SS_W16_H8 2
 INIT_XMM sse2
@@ -3302,7 +3302,7 @@
 BLOCKCOPY_SS_W16_H8 16, 24
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SS_W24_H4 2
 INIT_XMM sse2
@@ -3354,7 +3354,7 @@
 BLOCKCOPY_SS_W24_H4 24, 64
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SS_W32_H4 2
 INIT_XMM sse2
@@ -3422,7 +3422,7 @@
 BLOCKCOPY_SS_W32_H4 32, 48
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SS_W48_H2 2
 INIT_XMM sse2
@@ -3500,11 +3500,11 @@
 BLOCKCOPY_SS_W48_H2 48, 64
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SS_W64_H4 2
 INIT_XMM sse2
-cglobal blockcopy_ss_%1x%2, 4, 5, 6, dest, deststride, src, srcstride
+cglobal blockcopy_ss_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride
     mov     r4d, %2/4
     add     r1, r1
     add     r3, r3
@@ -3606,11 +3606,11 @@
 BLOCKCOPY_SS_W64_H4 64, 64
 
 ;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------
 %macro BLOCKCOPY_SS_W64_H4_avx 2
 INIT_YMM avx
-cglobal blockcopy_ss_%1x%2, 4, 7, 4, dest, deststride, src, srcstride
+cglobal blockcopy_ss_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride
     mov     r4d, %2/4
     add     r1, r1
     add     r3, r3
@@ -3670,152 +3670,82 @@
 BLOCKCOPY_SS_W64_H4_avx 64, 64
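
blockcopy_ss is a plain strided copy of int16_t, so the scalar equivalent is
one memcpy per row:

    #include <stdint.h>
    #include <string.h>

    enum { W = 64, H = 64 };        /* placeholder block size */

    static void blockcopy_ss_c(int16_t* dst, intptr_t dstStride,
                               const int16_t* src, intptr_t srcStride)
    {
        for (int y = 0; y < H; y++)
        {
            memcpy(dst, src, W * sizeof(int16_t));
            dst += dstStride;
            src += srcStride;
        }
    }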
 
 ;--------------------------------------------------------------------------------------
-; void copy16to16_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size);
+; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 ;--------------------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal copy16to16_shl, 5, 6, 2, dst, src, stride, shift, size
-%define shift       m1
-
-    ; make shift
-    movd            shift,    r3d
+cglobal cpy2Dto1D_shr_4, 3, 4, 4
+    add             r2d, r2d
+    movd            m0, r3m
+    pcmpeqw         m1, m1
+    psllw           m1, m0
+    psraw           m1, 1
 
     ; register alloc
     ; r0 - dst
     ; r1 - src
-    ; r2 - stride
-    ; r4 - size
-
-    sub             r2d,      r4d
-    add             r2d,      r2d
-    mov             r5d,      r4d
-    shr             r4d,      2
-.loop_row:
-    mov             r3d,      r4d
-
-.loop_col:
-    movh            m0,       [r1]
-    psllw           m0,       shift
-    movh            [r0],     m0
-
-    add             r1,       8
-    add             r0,       8
-
-    dec             r3d
-    jnz             .loop_col
-
-    add             r1,       r2
-    dec             r5d
-    jnz             .loop_row
+    ; r2 - srcStride
+    ; m0 - shift
+    ; m1 - word [-round]
+
+    ; Row 0-3
+    movh            m2, [r1]
+    movhps          m2, [r1 + r2]
+    lea             r1, [r1 + r2 * 2]
+    movh            m3, [r1]
+    movhps          m3, [r1 + r2]
+    psubw           m2, m1
+    psubw           m3, m1
+    psraw           m2, m0
+    psraw           m3, m0
+    mova            [r0 + 0 * mmsize], m2
+    mova            [r0 + 1 * mmsize], m3
     RET
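
The cpy2Dto1D_shr kernels keep the data 16-bit end to end and take only a
shift; the rounding constant is rebuilt inside the kernel. pcmpeqw sets every
word of m1 to -1, psllw turns that into -(1 << shift), and psraw by one
halves it to -(1 << (shift - 1)), so the psubw in the body effectively adds
the round term before psraw applies the shift. Scalar sketch, with N
standing for the 4/8/16/32 block size and shift assumed >= 1 as in the
callers:

    #include <stdint.h>

    enum { N = 4 };                 /* placeholder: one of 4, 8, 16, 32 */

    static void cpy2Dto1D_shr_c(int16_t* dst, const int16_t* src,
                                intptr_t srcStride, int shift)
    {
        const int16_t round = (int16_t)(1 << (shift - 1));

        for (int y = 0; y < N; y++)
        {
            for (int x = 0; x < N; x++)
                dst[x] = (int16_t)((src[x] + round) >> shift);

            dst += N;               /* 1D destination: rows are packed */
            src += srcStride;
        }
    }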
 
 
 ;--------------------------------------------------------------------------------------
-; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
+; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 ;--------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal cvt16to32_shr_4, 3,3,3
+INIT_XMM sse2
+cglobal cpy2Dto1D_shr_8, 3, 5, 4
     add             r2d, r2d
     movd            m0, r3m
-    movd            m1, r4m
-    pshufd          m1, m1, 0
+    pcmpeqw         m1, m1
+    psllw           m1, m0
+    psraw           m1, 1
+    mov             r3d, 8/4
+    lea             r4, [r2 * 3]
 
     ; register alloc
     ; r0 - dst
     ; r1 - src
-    ; r2 - stride
-    ; m0 - shift
-    ; m1 - dword [offset]
-
-    ; Row 0
-    pmovsxwd        m2, [r1]
-    paddd           m2, m1
-    psrad           m2, m0
-    movu            [r0 + 0 * mmsize], m2
-
-    ; Row 1
-    pmovsxwd        m2, [r1 + r2]
-    paddd           m2, m1
-    psrad           m2, m0
-    movu            [r0 + 1 * mmsize], m2
-
-    ; Row 2
-    lea             r1, [r1 + r2 * 2]
-    pmovsxwd        m2, [r1]
-    paddd           m2, m1
-    psrad           m2, m0
-    movu            [r0 + 2 * mmsize], m2
-
-    ; Row 3
-    pmovsxwd        m2, [r1 + r2]
-    paddd           m2, m1
-    psrad           m2, m0
-    movu            [r0 + 3 * mmsize], m2
-    RET
-
-
-;--------------------------------------------------------------------------------------
-; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
-;--------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal cvt16to32_shr_8, 3,5,3
-    add             r2d, r2d
-    movd            m0, r3m
-    movd            m1, r4m
-    pshufd          m1, m1, 0
-    mov             r3d, 8/4
-    lea             r4, [r2 * 3]
-
-    ; register alloc
-    ; r0 - dst
-    ; r1 - src
-    ; r2 - stride
+    ; r2 - srcStride
     ; r3 - loop counter
     ; r4 - stride * 3
     ; m0 - shift
-    ; m1 - dword [offset]
+    ; m1 - word [-round]
 
 .loop:
-    ; Row 0
-    pmovsxwd        m2, [r1]
-    pmovsxwd        m3, [r1 + mmsize/2]
-    paddd           m2, m1
-    paddd           m3, m1
-    psrad           m2, m0
-    psrad           m3, m0
-    movu            [r0 + 0 * mmsize], m2
-    movu            [r0 + 1 * mmsize], m3
-
-    ; Row 1
-    pmovsxwd        m2, [r1 + r2]
-    pmovsxwd        m3, [r1 + r2 + mmsize/2]
-    paddd           m2, m1
-    paddd           m3, m1
-    psrad           m2, m0
-    psrad           m3, m0
-    movu            [r0 + 2 * mmsize], m2
-    movu            [r0 + 3 * mmsize], m3
-
-    ; Row 2
-    pmovsxwd        m2, [r1 + r2 * 2]
-    pmovsxwd        m3, [r1 + r2 * 2 + mmsize/2]
-    paddd           m2, m1
-    paddd           m3, m1
-    psrad           m2, m0
-    psrad           m3, m0
-    movu            [r0 + 4 * mmsize], m2
-    movu            [r0 + 5 * mmsize], m3
-
-    ; Row 3
-    pmovsxwd        m2, [r1 + r4]
-    pmovsxwd        m3, [r1 + r4 + mmsize/2]
-    paddd           m2, m1
-    paddd           m3, m1
-    psrad           m2, m0
-    psrad           m3, m0
-    movu            [r0 + 6 * mmsize], m2
-    movu            [r0 + 7 * mmsize], m3
-
-    add             r0, 8 * mmsize
+    ; Row 0-1
+    mova            m2, [r1]
+    mova            m3, [r1 + r2]
+    psubw           m2, m1
+    psubw           m3, m1
+    psraw           m2, m0
+    psraw           m3, m0
+    mova            [r0 + 0 * mmsize], m2
+    mova            [r0 + 1 * mmsize], m3
+
+    ; Row 2-3
+    mova            m2, [r1 + r2 * 2]
+    mova            m3, [r1 + r4]
+    psubw           m2, m1
+    psubw           m3, m1
+    psraw           m2, m0
+    psraw           m3, m0
+    mova            [r0 + 2 * mmsize], m2
+    mova            [r0 + 3 * mmsize], m3
+
+    add             r0, 4 * mmsize
     lea             r1, [r1 + r2 * 4]
     dec             r3d
     jnz            .loop
@@ -3823,62 +3753,47 @@
 
 
 ;--------------------------------------------------------------------------------------
-; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
+; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 ;--------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal cvt16to32_shr_16, 3,4,6
+INIT_XMM sse2
+cglobal cpy2Dto1D_shr_16, 3, 4, 4
     add             r2d, r2d
     movd            m0, r3m
-    movd            m1, r4m
-    pshufd          m1, m1, 0
+    pcmpeqw         m1, m1
+    psllw           m1, m0
+    psraw           m1, 1
     mov             r3d, 16/2
 
     ; register alloc
     ; r0 - dst
     ; r1 - src
-    ; r2 - stride
+    ; r2 - srcStride
     ; r3 - loop counter
     ; m0 - shift
-    ; m1 - dword [offset]
+    ; m1 - word [-round]
 
 .loop:
     ; Row 0
-    pmovsxwd        m2, [r1 + 0 * mmsize/2]
-    pmovsxwd        m3, [r1 + 1 * mmsize/2]
-    pmovsxwd        m4, [r1 + 2 * mmsize/2]
-    pmovsxwd        m5, [r1 + 3 * mmsize/2]
-    paddd           m2, m1
-    paddd           m3, m1
-    paddd           m4, m1
-    paddd           m5, m1
-    psrad           m2, m0
-    psrad           m3, m0
-    psrad           m4, m0
-    psrad           m5, m0
-    movu            [r0 + 0 * mmsize], m2
-    movu            [r0 + 1 * mmsize], m3
-    movu            [r0 + 2 * mmsize], m4
-    movu            [r0 + 3 * mmsize], m5
+    mova            m2, [r1 + 0 * mmsize]
+    mova            m3, [r1 + 1 * mmsize]
+    psubw           m2, m1
+    psubw           m3, m1
+    psraw           m2, m0
+    psraw           m3, m0
+    mova            [r0 + 0 * mmsize], m2
+    mova            [r0 + 1 * mmsize], m3
 
     ; Row 1
-    pmovsxwd        m2, [r1 + r2 + 0 * mmsize/2]
-    pmovsxwd        m3, [r1 + r2 +1 * mmsize/2]
-    pmovsxwd        m4, [r1 + r2 +2 * mmsize/2]
-    pmovsxwd        m5, [r1 + r2 +3 * mmsize/2]
-    paddd           m2, m1
-    paddd           m3, m1
-    paddd           m4, m1
-    paddd           m5, m1
-    psrad           m2, m0
-    psrad           m3, m0
-    psrad           m4, m0
-    psrad           m5, m0
-    movu            [r0 + 4 * mmsize], m2
-    movu            [r0 + 5 * mmsize], m3
-    movu            [r0 + 6 * mmsize], m4
-    movu            [r0 + 7 * mmsize], m5
-
-    add             r0, 8 * mmsize
+    mova            m2, [r1 + r2 + 0 * mmsize]
+    mova            m3, [r1 + r2 + 1 * mmsize]
+    psubw           m2, m1
+    psubw           m3, m1
+    psraw           m2, m0
+    psraw           m3, m0
+    mova            [r0 + 2 * mmsize], m2
+    mova            [r0 + 3 * mmsize], m3
+
+    add             r0, 4 * mmsize
     lea             r1, [r1 + r2 * 2]
     dec             r3d
     jnz            .loop
@@ -3886,61 +3801,45 @@
 
 
 ;--------------------------------------------------------------------------------------
-; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
+; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 ;--------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal cvt16to32_shr_32, 3,4,6
+INIT_XMM sse2
+cglobal cpy2Dto1D_shr_32, 3, 4, 6
     add             r2d, r2d
     movd            m0, r3m
-    movd            m1, r4m
-    pshufd          m1, m1, 0
+    pcmpeqw         m1, m1
+    psllw           m1, m0
+    psraw           m1, 1
     mov             r3d, 32/1
 
     ; register alloc
     ; r0 - dst
     ; r1 - src
-    ; r2 - stride
+    ; r2 - srcStride
     ; r3 - loop counter
     ; m0 - shift
-    ; m1 - dword [offset]
+    ; m1 - word [-round]
 
 .loop:
     ; Row 0
-    pmovsxwd        m2, [r1 + 0 * mmsize/2]
-    pmovsxwd        m3, [r1 + 1 * mmsize/2]
-    pmovsxwd        m4, [r1 + 2 * mmsize/2]
-    pmovsxwd        m5, [r1 + 3 * mmsize/2]
-    paddd           m2, m1
-    paddd           m3, m1
-    paddd           m4, m1
-    paddd           m5, m1
-    psrad           m2, m0
-    psrad           m3, m0
-    psrad           m4, m0
-    psrad           m5, m0
-    movu            [r0 + 0 * mmsize], m2
-    movu            [r0 + 1 * mmsize], m3
-    movu            [r0 + 2 * mmsize], m4
-    movu            [r0 + 3 * mmsize], m5
-
-    pmovsxwd        m2, [r1 + 4 * mmsize/2]
-    pmovsxwd        m3, [r1 + 5 * mmsize/2]
-    pmovsxwd        m4, [r1 + 6 * mmsize/2]
-    pmovsxwd        m5, [r1 + 7 * mmsize/2]
-    paddd           m2, m1
-    paddd           m3, m1
-    paddd           m4, m1
-    paddd           m5, m1
-    psrad           m2, m0
-    psrad           m3, m0
-    psrad           m4, m0
-    psrad           m5, m0
-    movu            [r0 + 4 * mmsize], m2
-    movu            [r0 + 5 * mmsize], m3
-    movu            [r0 + 6 * mmsize], m4
-    movu            [r0 + 7 * mmsize], m5
-
-    add             r0, 8 * mmsize
+    mova            m2, [r1 + 0 * mmsize]
+    mova            m3, [r1 + 1 * mmsize]
+    mova            m4, [r1 + 2 * mmsize]
+    mova            m5, [r1 + 3 * mmsize]
+    psubw           m2, m1
+    psubw           m3, m1
+    psubw           m4, m1
+    psubw           m5, m1
+    psraw           m2, m0
+    psraw           m3, m0
+    psraw           m4, m0
+    psraw           m5, m0
+    mova            [r0 + 0 * mmsize], m2
+    mova            [r0 + 1 * mmsize], m3
+    mova            [r0 + 2 * mmsize], m4
+    mova            [r0 + 3 * mmsize], m5
+
+    add             r0, 4 * mmsize
     add             r1, r2
     dec             r3d
     jnz            .loop
@@ -3948,58 +3847,239 @@
 
 
 ;--------------------------------------------------------------------------------------
-; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
+; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
 ;--------------------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal cvt32to16_shl_4, 3,3,5
+cglobal cpy1Dto2D_shl_4, 3, 3, 3
     add         r2d, r2d
     movd        m0, r3m
 
     ; Row 0-3
+    mova        m1, [r1 + 0 * mmsize]
+    mova        m2, [r1 + 1 * mmsize]
+    psllw       m1, m0
+    psllw       m2, m0
+    movh        [r0], m1
+    movhps      [r0 + r2], m1
+    movh        [r0 + r2 * 2], m2
+    lea         r2, [r2 * 3]
+    movhps      [r0 + r2], m2
+    RET
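
cpy1Dto2D_shl runs the other way: read a packed 1D coefficient block, shift
each word left, and scatter it back out through dstStride. Sketch, same
placeholder N:

    #include <stdint.h>

    enum { N = 4 };                 /* placeholder: one of 4, 8, 16, 32 */

    static void cpy1Dto2D_shl_c(int16_t* dst, const int16_t* src,
                                intptr_t dstStride, int shift)
    {
        for (int y = 0; y < N; y++)
        {
            for (int x = 0; x < N; x++)
                dst[x] = (int16_t)(src[x] << shift);

            dst += dstStride;
            src += N;               /* 1D source: rows are packed */
        }
    }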
+
+
+INIT_YMM avx2
+cglobal cpy1Dto2D_shl_4, 3, 3, 2
+    add         r2d, r2d
+    movd        xm0, r3m
+
+    ; Row 0-3
+    movu        m1, [r1]
+    psllw       m1, xm0
+    vextracti128 xm0, m1, 1
+    movq        [r0], xm1
+    movhps      [r0 + r2], xm1
+    lea         r0, [r0 + r2 * 2]
+    movq        [r0], xm0
+    movhps      [r0 + r2], xm0
+    RET
+
+
+;--------------------------------------------------------------------------------------
+; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
+;--------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal cpy1Dto2D_shl_8, 3, 4, 5
+    add         r2d, r2d
+    movd        m0, r3m
+    lea         r3, [r2 * 3]
+
+    ; Row 0-3
+    mova        m1, [r1 + 0 * mmsize]
+    mova        m2, [r1 + 1 * mmsize]
+    mova        m3, [r1 + 2 * mmsize]
+    mova        m4, [r1 + 3 * mmsize]
+    psllw       m1, m0
+    psllw       m2, m0
+    psllw       m3, m0
+    psllw       m4, m0
+    mova        [r0], m1
+    mova        [r0 + r2], m2
+    mova        [r0 + r2 * 2], m3
+    mova        [r0 + r3], m4
+    lea         r0, [r0 + r2 * 4]
+
+    ; Row 4-7
+    mova        m1, [r1 + 4 * mmsize]
+    mova        m2, [r1 + 5 * mmsize]
+    mova        m3, [r1 + 6 * mmsize]
+    mova        m4, [r1 + 7 * mmsize]
+    psllw       m1, m0
+    psllw       m2, m0
+    psllw       m3, m0
+    psllw       m4, m0
+    mova        [r0], m1
+    mova        [r0 + r2], m2
+    mova        [r0 + r2 * 2], m3
+    mova        [r0 + r3], m4
+    RET
+
+
+INIT_YMM avx2
+cglobal cpy1Dto2D_shl_8, 3, 4, 3
+    add         r2d, r2d
+    movd        xm0, r3m
+    lea         r3, [r2 * 3]
+
+    ; Row 0-3
     movu        m1, [r1 + 0 * mmsize]
     movu        m2, [r1 + 1 * mmsize]
-    movu        m3, [r1 + 2 * mmsize]
-    movu        m4, [r1 + 3 * mmsize]
-    packssdw    m1, m2
-    packssdw    m3, m4
+    psllw       m1, xm0
+    psllw       m2, xm0
+    movu        [r0], xm1
+    vextracti128 [r0 + r2], m1, 1
+    movu        [r0 + r2 * 2], xm2
+    vextracti128 [r0 + r3], m2, 1
+
+    ; Row 4-7
+    movu        m1, [r1 + 2 * mmsize]
+    movu        m2, [r1 + 3 * mmsize]
+    lea         r0, [r0 + r2 * 4]
+    psllw       m1, xm0
+    psllw       m2, xm0
+    movu        [r0], xm1
+    vextracti128 [r0 + r2], m1, 1
+    movu        [r0 + r2 * 2], xm2
+    vextracti128 [r0 + r3], m2, 1
+    RET
+
+
+;--------------------------------------------------------------------------------------
+; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
+;--------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal cpy1Dto2D_shl_16, 3, 4, 5
+    add         r2d, r2d
+    movd        m0, r3m
+    mov         r3d, 16/4
+
+.loop:
+    ; Row 0-1
+    mova        m1, [r1 + 0 * mmsize]
+    mova        m2, [r1 + 1 * mmsize]
+    mova        m3, [r1 + 2 * mmsize]
+    mova        m4, [r1 + 3 * mmsize]
     psllw       m1, m0
+    psllw       m2, m0
     psllw       m3, m0
-    movh        [r0], m1
-    movhps      [r0 + r2], m1
-    movh        [r0 + r2 * 2], m3
-    lea         r2, [r2 * 3]
-    movhps      [r0 + r2], m3
+    psllw       m4, m0
+    mova        [r0], m1
+    mova        [r0 + 16], m2
+    mova        [r0 + r2], m3
+    mova        [r0 + r2 + 16], m4
+
+    ; Row 2-3
+    mova        m1, [r1 + 4 * mmsize]
+    mova        m2, [r1 + 5 * mmsize]
+    mova        m3, [r1 + 6 * mmsize]
+    mova        m4, [r1 + 7 * mmsize]
+    lea         r0, [r0 + r2 * 2]
+    psllw       m1, m0
+    psllw       m2, m0
+    psllw       m3, m0
+    psllw       m4, m0
+    mova        [r0], m1
+    mova        [r0 + 16], m2
+    mova        [r0 + r2], m3
+    mova        [r0 + r2 + 16], m4
+
+    add         r1, 8 * mmsize
+    lea         r0, [r0 + r2 * 2]
+    dec         r3d
+    jnz        .loop
     RET
 
 
 INIT_YMM avx2
-cglobal cvt32to16_shl_4, 3,3,3
+cglobal cpy1Dto2D_shl_16, 3, 5, 3
     add         r2d, r2d
     movd        xm0, r3m
-
-    ; Row 0-3
+    mov         r3d, 16/4
+    lea         r4, [r2 * 3]
+
+.loop:
+    ; Row 0-1
     movu        m1, [r1 + 0 * mmsize]
     movu        m2, [r1 + 1 * mmsize]
-    packssdw    m1, m2
     psllw       m1, xm0
-    vextracti128 xm0, m1, 1
-    movq        [r0], xm1
-    movq        [r0 + r2], xm0
-    lea         r0, [r0 + r2 * 2]
-    movhps      [r0], xm1
-    movhps      [r0 + r2], xm0
+    psllw       m2, xm0
+    movu        [r0], m1
+    movu        [r0 + r2], m2
+
+    ; Row 2-3
+    movu        m1, [r1 + 2 * mmsize]
+    movu        m2, [r1 + 3 * mmsize]
+    psllw       m1, xm0
+    psllw       m2, xm0
+    movu        [r0 + r2 * 2], m1
+    movu        [r0 + r4], m2
+
+    add         r1, 4 * mmsize
+    lea         r0, [r0 + r2 * 4]
+    dec         r3d
+    jnz        .loop
     RET
 
 
 ;--------------------------------------------------------------------------------------
-; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
+; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
 ;--------------------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal cvt32to16_shl_8, 3,5,5
+cglobal cpy1Dto2D_shl_32, 3, 4, 5
     add         r2d, r2d
     movd        m0, r3m
-    mov         r3d, 8/4
-    lea         r4, [r2 * 3]
+    mov         r3d, 32/2
+
+.loop:
+    ; Row 0
+    mova        m1, [r1 + 0 * mmsize]
+    mova        m2, [r1 + 1 * mmsize]
+    mova        m3, [r1 + 2 * mmsize]
+    mova        m4, [r1 + 3 * mmsize]
+    psllw       m1, m0
+    psllw       m2, m0
+    psllw       m3, m0
+    psllw       m4, m0
+    mova        [r0 + 0 * mmsize], m1
+    mova        [r0 + 1 * mmsize], m2
+    mova        [r0 + 2 * mmsize], m3
+    mova        [r0 + 3 * mmsize], m4
+
+    ; Row 1
+    mova        m1, [r1 + 4 * mmsize]
+    mova        m2, [r1 + 5 * mmsize]
+    mova        m3, [r1 + 6 * mmsize]
+    mova        m4, [r1 + 7 * mmsize]
+    psllw       m1, m0
+    psllw       m2, m0
+    psllw       m3, m0
+    psllw       m4, m0
+    mova        [r0 + r2 + 0 * mmsize], m1
+    mova        [r0 + r2 + 1 * mmsize], m2
+    mova        [r0 + r2 + 2 * mmsize], m3
+    mova        [r0 + r2 + 3 * mmsize], m4
+
+    add         r1, 8 * mmsize
+    lea         r0, [r0 + r2 * 2]
+    dec         r3d
+    jnz        .loop
+    RET
+
+
+INIT_YMM avx2
+cglobal cpy1Dto2D_shl_32, 3, 4, 5
+    add         r2d, r2d
+    movd        xm0, r3m
+    mov         r3d, 32/2
 
 .loop:
     ; Row 0-1
@@ -4007,252 +4087,14 @@
     movu        m2, [r1 + 1 * mmsize]
     movu        m3, [r1 + 2 * mmsize]
     movu        m4, [r1 + 3 * mmsize]
-    packssdw    m1, m2
-    packssdw    m3, m4
-    psllw       m1, m0
-    psllw       m3, m0
+    psllw       m1, xm0
+    psllw       m2, xm0
+    psllw       m3, xm0
+    psllw       m4, xm0
     movu        [r0], m1
+    movu        [r0 + mmsize], m2
     movu        [r0 + r2], m3
-
-    ; Row 2-3
-    movu        m1, [r1 + 4 * mmsize]
-    movu        m2, [r1 + 5 * mmsize]
-    movu        m3, [r1 + 6 * mmsize]
-    movu        m4, [r1 + 7 * mmsize]
-    packssdw    m1, m2
-    packssdw    m3, m4
-    psllw       m1, m0
-    psllw       m3, m0
-    movu        [r0 + r2 * 2], m1
-    movu        [r0 + r4], m3
-
-    add         r1, 8 * mmsize
-    lea         r0, [r0 + r2 * 4]
-    dec         r3d
-    jnz        .loop
-    RET
-
-
-INIT_YMM avx2
-cglobal cvt32to16_shl_8, 3,4,3
-    add         r2d, r2d
-    movd        xm0, r3m
-    lea         r3, [r2 * 3]
-
-    ; Row 0-1
-    movu        xm1, [r1 + 0 * mmsize]
-    vinserti128  m1, m1, [r1 + 1 * mmsize], 1
-    movu        xm2, [r1 + 0 * mmsize + mmsize/2]
-    vinserti128  m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
-    packssdw    m1, m2
-    psllw       m1, xm0
-    movu        [r0], xm1
-    vextracti128 [r0 + r2], m1, 1
-
-    ; Row 2-3
-    movu        xm1, [r1 + 2 * mmsize]
-    vinserti128  m1, m1, [r1 + 3 * mmsize], 1
-    movu        xm2, [r1 + 2 * mmsize + mmsize/2]
-    vinserti128  m2, m2, [r1 + 3 * mmsize + mmsize/2], 1
-    packssdw    m1, m2
-    psllw       m1, xm0
-    movu        [r0 + r2 * 2], xm1
-    vextracti128 [r0 + r3], m1, 1
-
-    add         r1, 4 * mmsize
-    lea         r0, [r0 + r2 * 4]
-
-    ; Row 4-5
-    movu        m1, [r1 + 0 * mmsize]
-    movu        m2, [r1 + 1 * mmsize]
-    packssdw    m1, m2
-    vpermq      m1, m1, 11011000b
-    psllw       m1, xm0
-    movu        [r0], xm1
-    vextracti128 [r0 + r2], m1, 1
-
-    ; Row 6-7
-    movu        m1, [r1 + 2 * mmsize]
-    movu        m2, [r1 + 3 * mmsize]
-    packssdw    m1, m2
-    vpermq      m1, m1, 11011000b
-    psllw       m1, xm0
-    movu        [r0 + r2 * 2], xm1
-    vextracti128 [r0 + r3], m1, 1
-    RET
-
-;--------------------------------------------------------------------------------------
-; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
-;--------------------------------------------------------------------------------------
-INIT_XMM sse2
-cglobal cvt32to16_shl_16, 3,4,5
-    add         r2d, r2d
-    movd        m0, r3m
-    mov         r3d, 16/2
-
-.loop:
-    ; Row 0
-    movu        m1, [r1 + 0 * mmsize]
-    movu        m2, [r1 + 1 * mmsize]
-    movu        m3, [r1 + 2 * mmsize]
-    movu        m4, [r1 + 3 * mmsize]
-    packssdw    m1, m2
-    packssdw    m3, m4
-    psllw       m1, m0
-    psllw       m3, m0
-    movu        [r0], m1
-    movu        [r0 + mmsize], m3
-
-    ; Row 1
-    movu        m1, [r1 + 4 * mmsize]
-    movu        m2, [r1 + 5 * mmsize]
-    movu        m3, [r1 + 6 * mmsize]
-    movu        m4, [r1 + 7 * mmsize]
-    packssdw    m1, m2
-    packssdw    m3, m4
-    psllw       m1, m0
-    psllw       m3, m0
-    movu        [r0 + r2], m1
-    movu        [r0 + r2 + mmsize], m3
-
-    add         r1, 8 * mmsize
-    lea         r0, [r0 + r2 * 2]
-    dec         r3d
-    jnz        .loop
-    RET
-
-
-INIT_YMM avx2
-cglobal cvt32to16_shl_16, 3,5,3
-    add         r2d, r2d
-    movd        xm0, r3m
-    mov         r3d, 16/4
-    lea         r4, [r2 * 3]
-
-.loop:
-    ; Row 0
-    movu        xm1, [r1 + 0 * mmsize]
-    vinserti128  m1, m1, [r1 + 1 * mmsize], 1
-    movu        xm2, [r1 + 0 * mmsize + mmsize/2]
-    vinserti128  m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
-    packssdw    m1, m2
-    psllw       m1, xm0
-    movu        [r0], m1
-
-    ; Row 1
-    movu        xm1, [r1 + 2 * mmsize]
-    vinserti128  m1, m1, [r1 + 3 * mmsize], 1
-    movu        xm2, [r1 + 2 * mmsize + mmsize/2]
-    vinserti128  m2, m2, [r1 + 3 * mmsize + mmsize/2], 1
-    packssdw    m1, m2
-    psllw       m1, xm0
-    movu        [r0 + r2], m1
-
-    add         r1, 4 * mmsize
-
-    ; Row 2
-    movu        xm1, [r1 + 0 * mmsize]
-    vinserti128  m1, m1, [r1 + 1 * mmsize], 1
-    movu        xm2, [r1 + 0 * mmsize + mmsize/2]
-    vinserti128  m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
-    packssdw    m1, m2
-    psllw       m1, xm0
-    movu        [r0 + r2 * 2], m1
-
-    ; Row 3
-    movu        m1, [r1 + 2 * mmsize]
-    movu        m2, [r1 + 3 * mmsize]
-    packssdw    m1, m2
-    psllw       m1, xm0
-    vpermq      m1, m1, 11011000b
-    movu        [r0 + r4], m1
-
-    add         r1, 4 * mmsize
-    lea         r0, [r0 + r2 * 4]
-    dec         r3d
-    jnz        .loop
-    RET
-
-
-;--------------------------------------------------------------------------------------
-; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
-;--------------------------------------------------------------------------------------
-INIT_XMM sse2
-cglobal cvt32to16_shl_32, 3,4,5
-    add         r2d, r2d
-    movd        m0, r3m
-    mov         r3d, 32/1
-
-.loop:
-    ; Row 0
-    movu        m1, [r1 + 0 * mmsize]
-    movu        m2, [r1 + 1 * mmsize]
-    movu        m3, [r1 + 2 * mmsize]
-    movu        m4, [r1 + 3 * mmsize]
-    packssdw    m1, m2
-    packssdw    m3, m4
-    psllw       m1, m0
-    psllw       m3, m0
-    movu        [r0 + 0 * mmsize], m1
-    movu        [r0 + 1 * mmsize], m3
-
-    movu        m1, [r1 + 4 * mmsize]
-    movu        m2, [r1 + 5 * mmsize]
-    movu        m3, [r1 + 6 * mmsize]
-    movu        m4, [r1 + 7 * mmsize]
-    packssdw    m1, m2
-    packssdw    m3, m4
-    psllw       m1, m0
-    psllw       m3, m0
-    movu        [r0 + 2 * mmsize], m1
-    movu        [r0 + 3 * mmsize], m3
-
-    add         r1, 8 * mmsize
-    add         r0, r2
-    dec         r3d
-    jnz        .loop
-    RET
-
-
-INIT_YMM avx2
-cglobal cvt32to16_shl_32, 3,4,5
-    add         r2d, r2d
-    movd        xm0, r3m
-    mov         r3d, 32/2
-
-.loop:
-    ; Row 0
-    movu        xm1, [r1 + 0 * mmsize]
-    vinserti128  m1, m1, [r1 + 1 * mmsize], 1
-    movu        xm2, [r1 + 0 * mmsize + mmsize/2]
-    vinserti128  m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
-    movu        xm3, [r1 + 2 * mmsize]
-    vinserti128  m3, m3, [r1 + 3 * mmsize], 1
-    movu        xm4, [r1 + 2 * mmsize + mmsize/2]
-    vinserti128  m4, m4, [r1 + 3 * mmsize + mmsize/2], 1
-    packssdw    m1, m2
-    packssdw    m3, m4
-    psllw       m1, xm0
-    psllw       m3, xm0
-    movu        [r0], m1
-    movu        [r0 + mmsize], m3
-
-    add         r1, 4 * mmsize
-
-    ; Row 1
-    movu        xm1, [r1 + 0 * mmsize]
-    vinserti128  m1, m1, [r1 + 1 * mmsize], 1
-    movu        xm2, [r1 + 0 * mmsize + mmsize/2]
-    vinserti128  m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
-    movu        m3, [r1 + 2 * mmsize]
-    movu        m4, [r1 + 3 * mmsize]
-    packssdw    m1, m2
-    packssdw    m3, m4
-    psllw       m1, xm0
-    psllw       m3, xm0
-    vpermq      m3, m3, 11011000b
-    movu        [r0 + r2], m1
-    movu        [r0 + r2 + mmsize], m3
+    movu        [r0 + r2 + mmsize], m4
 
     add         r1, 4 * mmsize
     lea         r0, [r0 + r2 * 2]
@@ -4262,7 +4104,7 @@
 
 
 ;--------------------------------------------------------------------------------------
-; uint32_t copy_cnt(int16_t *dst, int16_t *src, intptr_t stride);
+; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
 ;--------------------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal copy_cnt_4, 3,3,3
@@ -4301,7 +4143,7 @@
 
 
 ;--------------------------------------------------------------------------------------
-; uint32_t copy_cnt(int16_t *dst, int16_t *src, intptr_t stride);
+; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
 ;--------------------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal copy_cnt_8, 3,3,6
@@ -4405,7 +4247,7 @@
 
 
 ;--------------------------------------------------------------------------------------
-; uint32_t copy_cnt(int16_t *dst, int16_t *src, intptr_t stride);
+; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
 ;--------------------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal copy_cnt_16, 3,4,6
@@ -4516,7 +4358,7 @@
     RET
 
 ;--------------------------------------------------------------------------------------
-; uint32_t copy_cnt(int32_t *dst, int16_t *src, intptr_t stride);
+; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
 ;--------------------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal copy_cnt_32, 3,4,6
@@ -4623,180 +4465,432 @@
     movd         eax, xm4
     RET
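
copy_cnt pairs the coefficient copy with a count of nonzero entries; the
SSE4 bodies accumulate compare masks and finish with a horizontal sum into
eax. Roughly, with N again a placeholder:

    #include <stdint.h>

    enum { N = 16 };                /* placeholder: one of 4, 8, 16, 32 */

    static uint32_t copy_cnt_c(int16_t* dst, const int16_t* src,
                               intptr_t srcStride)
    {
        uint32_t numSig = 0;

        for (int y = 0; y < N; y++)
        {
            for (int x = 0; x < N; x++)
            {
                dst[x] = src[x];
                numSig += (src[x] != 0);
            }

            dst += N;
            src += srcStride;
        }

        return numSig;
    }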
 
-;-----------------------------------------------------------------------------
-; void copy_shr(short *dst, short *src, intptr_t stride, int shift, int size)
-;-----------------------------------------------------------------------------
-
-INIT_XMM sse4
-cglobal copy_shr, 4, 7, 4, dst, src, stride
-%define rnd     m2
-%define shift   m1
-
-    ; make shift
-    mov         r5d, r3m
-    movd        shift, r5d
-
-    ; make round
-    dec         r5
-    xor         r6, r6
-    bts         r6, r5
-
-    movd        rnd, r6d
-    pshufd      rnd, rnd, 0
+
+;--------------------------------------------------------------------------------------
+; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+;--------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal cpy2Dto1D_shl_4, 4, 4, 4
+    add             r2d, r2d
+    movd            m0, r3d
 
     ; register alloc
     ; r0 - dst
     ; r1 - src
-    ; r2 - stride * 2 (short*)
-    ; r3 - lx
-    ; r4 - size
-    ; r5 - ly
-    ; r6 - diff
-    add         r2d, r2d
-
-    mov         r4d, r4m
-    mov         r5, r4 ; size
-    mov         r6, r2 ; stride
-    sub         r6, r4
-    add         r6, r6
-
-    shr         r5, 1
-.loop_row:
-
-    mov         r3, r4
-    shr         r3, 2
-.loop_col:
-    ; row 0
-    movh        m3, [r1]
-    pmovsxwd    m0, m3
-    paddd       m0, rnd
-    psrad       m0, shift
-    packssdw    m0, m0
-    movh        [r0], m0
-
-    ; row 1
-    movh        m3, [r1 + r4 * 2]
-    pmovsxwd    m0, m3
-    paddd       m0, rnd
-    psrad       m0, shift
-    packssdw    m0, m0
-    movh        [r0 + r2], m0
-
-    ; move col pointer
-    add         r1, 8
-    add         r0, 8
-
-    dec         r3
-    jg          .loop_col
-
-    ; update pointer
-    lea         r1, [r1 + r4 * 2]
-    add         r0, r6
-
-    ; end of loop_row
-    dec         r5
-    jg         .loop_row
+    ; r2 - srcStride
+    ; m0 - shift
+
+    ; Row 0-3
+    movh            m2, [r1]
+    movhps          m2, [r1 + r2]
+    lea             r1, [r1 + r2 * 2]
+    movh            m3, [r1]
+    movhps          m3, [r1 + r2]
+    psllw           m2, m0
+    psllw           m3, m0
+    mova            [r0 + 0 * mmsize], m2
+    mova            [r0 + 1 * mmsize], m3
 
     RET
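
cpy2Dto1D_shl is the rounding-free counterpart of cpy2Dto1D_shr above:
strided 16-bit loads, psllw, packed stores. Sketch:

    #include <stdint.h>

    enum { N = 4 };                 /* placeholder: one of 4, 8, 16, 32 */

    static void cpy2Dto1D_shl_c(int16_t* dst, const int16_t* src,
                                intptr_t srcStride, int shift)
    {
        for (int y = 0; y < N; y++)
        {
            for (int x = 0; x < N; x++)
                dst[x] = (int16_t)(src[x] << shift);

            dst += N;               /* 1D destination: rows are packed */
            src += srcStride;
        }
    }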
 
+
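
For reference, the cpy2Dto1D_shl family gathers a strided 2D residual
block into a packed linear buffer, left-shifting every coefficient.
The _8, _16 and _32 variants below loop over the block, handling 4, 2
and 1 rows per iteration (hence the 8/4, 16/2 and 32/1 loop-count
initializers); the add r2d, r2d at entry converts the stride from
int16_t units to bytes. A minimal C sketch of the behavior, with a
hypothetical helper name and explicit size parameter:

    static void cpy2Dto1D_shl_ref(int16_t* dst, const int16_t* src,
                                  intptr_t srcStride, int shift, int size)
    {
        for (int i = 0; i < size; i++)
        {
            for (int j = 0; j < size; j++)
                dst[j] = (int16_t)(src[j] << shift);
            src += srcStride;    /* 2D source, strided */
            dst += size;         /* 1D destination, packed */
        }
    }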
 ;--------------------------------------------------------------------------------------
-; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
+; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 ;--------------------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal copy_shl_4, 3,3,3
+cglobal cpy2Dto1D_shl_8, 4, 5, 4
+    add             r2d, r2d
+    movd            m0, r3d
+    mov             r3d, 8/4
+    lea             r4, [r2 * 3]
+
+    ; register alloc
+    ; r0 - dst
+    ; r1 - src
+    ; r2 - srcStride
+    ; r3 - loop counter
+    ; r4 - stride * 3
+    ; m0 - shift
+
+.loop:
+    ; Row 0, 1
+    mova            m2, [r1]
+    mova            m3, [r1 + r2]
+    psllw           m2, m0
+    psllw           m3, m0
+    mova            [r0 + 0 * mmsize], m2
+    mova            [r0 + 1 * mmsize], m3
+
+    ; Row 2, 3
+    mova            m2, [r1 + r2 * 2]
+    mova            m3, [r1 + r4]
+    psllw           m2, m0
+    psllw           m3, m0
+    mova            [r0 + 2 * mmsize], m2
+    mova            [r0 + 3 * mmsize], m3
+
+    add             r0, 4 * mmsize
+    lea             r1, [r1 + r2 * 4]
+    dec             r3d
+    jnz            .loop
+    RET
+
+
+;--------------------------------------------------------------------------------------
+; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+;--------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal cpy2Dto1D_shl_16, 4, 4, 4
+    add             r2d, r2d
+    movd            m0, r3d
+    mov             r3d, 16/2
+
+    ; register alloc
+    ; r0 - dst
+    ; r1 - src
+    ; r2 - srcStride
+    ; r3 - loop counter
+    ; m0 - shift
+
+.loop:
+    ; Row 0
+    mova            m2, [r1 + 0 * mmsize]
+    mova            m3, [r1 + 1 * mmsize]
+    psllw           m2, m0
+    psllw           m3, m0
+    mova            [r0 + 0 * mmsize], m2
+    mova            [r0 + 1 * mmsize], m3
+
+    ; Row 1
+    mova            m2, [r1 + r2 + 0 * mmsize]
+    mova            m3, [r1 + r2 + 1 * mmsize]
+    psllw           m2, m0
+    psllw           m3, m0
+    mova            [r0 + 2 * mmsize], m2
+    mova            [r0 + 3 * mmsize], m3
+
+    add             r0, 4 * mmsize
+    lea             r1, [r1 + r2 * 2]
+    dec             r3d
+    jnz            .loop
+    RET
+
+
+;--------------------------------------------------------------------------------------
+; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+;--------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal cpy2Dto1D_shl_32, 4, 4, 6
+    add             r2d, r2d
+    movd            m0, r3d
+    mov             r3d, 32/1
+
+    ; register alloc
+    ; r0 - dst
+    ; r1 - src
+    ; r2 - srcStride
+    ; r3 - loop counter
+    ; m0 - shift
+
+.loop:
+    ; Row 0
+    mova            m2, [r1 + 0 * mmsize]
+    mova            m3, [r1 + 1 * mmsize]
+    mova            m4, [r1 + 2 * mmsize]
+    mova            m5, [r1 + 3 * mmsize]
+    psllw           m2, m0
+    psllw           m3, m0
+    psllw           m4, m0
+    psllw           m5, m0
+    mova            [r0 + 0 * mmsize], m2
+    mova            [r0 + 1 * mmsize], m3
+    mova            [r0 + 2 * mmsize], m4
+    mova            [r0 + 3 * mmsize], m5
+
+    add             r0, 4 * mmsize
+    add             r1, r2
+    dec             r3d
+    jnz            .loop
+    RET
+
+
+;--------------------------------------------------------------------------------------
+; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
+;--------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal cpy1Dto2D_shr_4, 3, 3, 4
     add         r2d, r2d
     movd        m0, r3m
+    pcmpeqw     m1, m1
+    psllw       m1, m0
+    psraw       m1, 1
 
     ; Row 0-3
-    movu        m1, [r1 + 0 * mmsize]
-    movu        m2, [r1 + 1 * mmsize]
-    psllw       m1, m0
-    psllw       m2, m0
-    movh        [r0], m1
-    movhps      [r0 + r2], m1
-    movh        [r0 + r2 * 2], m2
+    mova        m2, [r1 + 0 * mmsize]
+    mova        m3, [r1 + 1 * mmsize]
+    psubw       m2, m1
+    psubw       m3, m1
+    psraw       m2, m0
+    psraw       m3, m0
+    movh        [r0], m2
+    movhps      [r0 + r2], m2
+    movh        [r0 + r2 * 2], m3
     lea         r2, [r2 * 3]
-    movhps      [r0 + r2], m2
+    movhps      [r0 + r2], m3
     RET
 
+
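
cpy1Dto2D_shr is the opposite direction with rounding: a packed linear
buffer is scattered back into a strided 2D block, shifted right with
round-to-nearest. The pcmpeqw/psllw/psraw sequence above builds the
per-word constant -(1 << (shift - 1)) without touching memory
(all-ones is -1 per word; shifted left by shift it becomes
-(1 << shift); an arithmetic right shift by 1 halves that), so the
psubw that follows effectively adds the rounding offset
+(1 << (shift - 1)). A C sketch of the behavior, assuming shift >= 1
and a hypothetical helper name:

    static void cpy1Dto2D_shr_ref(int16_t* dst, const int16_t* src,
                                  intptr_t dstStride, int shift, int size)
    {
        const int round = 1 << (shift - 1);
        for (int i = 0; i < size; i++)
        {
            for (int j = 0; j < size; j++)
                dst[j] = (int16_t)((src[j] + round) >> shift);
            src += size;         /* 1D source, packed */
            dst += dstStride;    /* 2D destination, strided */
        }
    }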
+INIT_YMM avx2
+cglobal cpy1Dto2D_shr_4, 3, 3, 3
+    add         r2d, r2d
+    movd        xm0, r3m
+    pcmpeqw     m1, m1
+    psllw       m1, xm0
+    psraw       m1, 1
+
+    ; Row 0-3
+    movu        m2, [r1]
+    psubw       m2, m1
+    psraw       m2, xm0
+    vextracti128 xm1, m2, 1
+    movq        [r0], xm2
+    movhps      [r0 + r2], xm2
+    lea         r0, [r0 + r2 * 2]
+    movq        [r0], xm1
+    movhps      [r0 + r2], xm1
+    RET
+
+
 ;--------------------------------------------------------------------------------------
-; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
+; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
 ;--------------------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal copy_shl_8, 3,4,5
+cglobal cpy1Dto2D_shr_8, 3, 4, 6
     add         r2d, r2d
     movd        m0, r3m
+    pcmpeqw     m1, m1
+    psllw       m1, m0
+    psraw       m1, 1
+    lea         r3, [r2 * 3]
 
     ; Row 0-3
-    movu        m1, [r1 + 0 * mmsize]
-    movu        m2, [r1 + 1 * mmsize]
-    movu        m3, [r1 + 2 * mmsize]
-    movu        m4, [r1 + 3 * mmsize]
-    psllw       m1, m0
-    psllw       m2, m0
-    psllw       m3, m0
-    psllw       m4, m0
-    movu        [r0], m1
-    movu        [r0 + r2], m2
-    movu        [r0 + 2 * r2], m3
-    lea         r0, [r0 + 2 * r2]
-    movu        [r0 + r2], m4
+    mova        m2, [r1 + 0 * mmsize]
+    mova        m3, [r1 + 1 * mmsize]
+    mova        m4, [r1 + 2 * mmsize]
+    mova        m5, [r1 + 3 * mmsize]
+    psubw       m2, m1
+    psubw       m3, m1
+    psubw       m4, m1
+    psubw       m5, m1
+    psraw       m2, m0
+    psraw       m3, m0
+    psraw       m4, m0
+    psraw       m5, m0
+    mova        [r0], m2
+    mova        [r0 + r2], m3
+    mova        [r0 + r2 * 2], m4
+    mova        [r0 + r3], m5
 
     ; Row 4-7
-    movu        m1, [r1 + 4 * mmsize]
-    movu        m2, [r1 + 5 * mmsize]
-    movu        m3, [r1 + 6 * mmsize]
-    movu        m4, [r1 + 7 * mmsize]
-    psllw       m1, m0
-    psllw       m2, m0
-    psllw       m3, m0
-    psllw       m4, m0
-    movu        [r0 + r2 * 2], m1
-    lea         r0, [r0 + 2 * r2]
-    movu        [r0 + r2], m2
-    movu        [r0 + 2 * r2], m3
-    lea         r0, [r0 + 2 * r2]
-    movu        [r0 + r2], m4
+    mova        m2, [r1 + 4 * mmsize]
+    mova        m3, [r1 + 5 * mmsize]
+    mova        m4, [r1 + 6 * mmsize]
+    mova        m5, [r1 + 7 * mmsize]
+    lea         r0, [r0 + r2 * 4]
+    psubw       m2, m1
+    psubw       m3, m1
+    psubw       m4, m1
+    psubw       m5, m1
+    psraw       m2, m0
+    psraw       m3, m0
+    psraw       m4, m0
+    psraw       m5, m0
+    mova        [r0], m2
+    mova        [r0 + r2], m3
+    mova        [r0 + r2 * 2], m4
+    mova        [r0 + r3], m5
     RET
 
+
+INIT_YMM avx2
+cglobal cpy1Dto2D_shr_8, 3, 4, 4
+    add         r2d, r2d
+    movd        xm0, r3m
+    pcmpeqw     m1, m1
+    psllw       m1, xm0
+    psraw       m1, 1
+    lea         r3, [r2 * 3]
+
+    ; Row 0-3
+    movu        m2, [r1 + 0 * mmsize]
+    movu        m3, [r1 + 1 * mmsize]
+    psubw       m2, m1
+    psubw       m3, m1
+    psraw       m2, xm0
+    psraw       m3, xm0
+    movu        [r0], xm2
+    vextracti128 [r0 + r2], m2, 1
+    movu        [r0 + r2 * 2], xm3
+    vextracti128 [r0 + r3], m3, 1
+
+    ; Row 4-7
+    movu        m2, [r1 + 2 * mmsize]
+    movu        m3, [r1 + 3 * mmsize]
+    lea         r0, [r0 + r2 * 4]
+    psubw       m2, m1
+    psubw       m3, m1
+    psraw       m2, xm0
+    psraw       m3, xm0
+    movu        [r0], xm2
+    vextracti128 [r0 + r2], m2, 1
+    movu        [r0 + r2 * 2], xm3
+    vextracti128 [r0 + r3], m3, 1
+    RET
+
+
 ;--------------------------------------------------------------------------------------
-; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
+; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
 ;--------------------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal copy_shl_16, 3,4,5
+cglobal cpy1Dto2D_shr_16, 3, 5, 6
     add         r2d, r2d
     movd        m0, r3m
-    mov         r3d, 256/64
+    pcmpeqw     m1, m1
+    psllw       m1, m0
+    psraw       m1, 1
+    mov         r3d, 16/4
+    lea         r4, [r2 * 3]
 
 .loop:
-    ; Row 0-3
-    movu        m1, [r1 + 0 * mmsize]
-    movu        m2, [r1 + 1 * mmsize]
-    movu        m3, [r1 + 2 * mmsize]
-    movu        m4, [r1 + 3 * mmsize]
+    ; Row 0-1
+    mova        m2, [r1 + 0 * mmsize]
+    mova        m3, [r1 + 1 * mmsize]
+    mova        m4, [r1 + 2 * mmsize]
+    mova        m5, [r1 + 3 * mmsize]
+    psubw       m2, m1
+    psubw       m3, m1
+    psubw       m4, m1
+    psubw       m5, m1
+    psraw       m2, m0
+    psraw       m3, m0
+    psraw       m4, m0
+    psraw       m5, m0
+    mova        [r0], m2
+    mova        [r0 + mmsize], m3
+    mova        [r0 + r2], m4
+    mova        [r0 + r2 + mmsize], m5
+
+    ; Row 2-3
+    mova        m2, [r1 + 4 * mmsize]
+    mova        m3, [r1 + 5 * mmsize]
+    mova        m4, [r1 + 6 * mmsize]
+    mova        m5, [r1 + 7 * mmsize]
+    psubw       m2, m1
+    psubw       m3, m1
+    psubw       m4, m1
+    psubw       m5, m1
+    psraw       m2, m0
+    psraw       m3, m0
+    psraw       m4, m0
+    psraw       m5, m0
+    mova        [r0 + r2 * 2], m2
+    mova        [r0 + r2 * 2 + mmsize], m3
+    mova        [r0 + r4], m4
+    mova        [r0 + r4 + mmsize], m5
+
+    add         r1, 8 * mmsize
+    lea         r0, [r0 + r2 * 4]
+    dec         r3d
+    jnz        .loop
+    RET
+
+
+INIT_YMM avx2
+cglobal cpy1Dto2D_shr_16, 3, 5, 4
+    add         r2d, r2d
+    movd        xm0, r3m
+    pcmpeqw     m1, m1
+    psllw       m1, xm0
+    psraw       m1, 1
+    mov         r3d, 16/4
+    lea         r4, [r2 * 3]
+
+.loop:
+    ; Row 0-1
+    movu        m2, [r1 + 0 * mmsize]
+    movu        m3, [r1 + 1 * mmsize]
+    psubw       m2, m1
+    psubw       m3, m1
+    psraw       m2, xm0
+    psraw       m3, xm0
+    movu        [r0], m2
+    movu        [r0 + r2], m3
+
+    ; Row 2-3
+    movu        m2, [r1 + 2 * mmsize]
+    movu        m3, [r1 + 3 * mmsize]
+    psubw       m2, m1
+    psubw       m3, m1
+    psraw       m2, xm0
+    psraw       m3, xm0
+    movu        [r0 + r2 * 2], m2
+    movu        [r0 + r4], m3
+
+    add         r1, 4 * mmsize
+    lea         r0, [r0 + r2 * 4]
+    dec         r3d
+    jnz        .loop
+    RET
+
+
+;--------------------------------------------------------------------------------------
+; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
+;--------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal cpy1Dto2D_shr_32, 3, 4, 6
+    add         r2d, r2d
+    movd        m0, r3m
+    pcmpeqw     m1, m1
     psllw       m1, m0
-    psllw       m2, m0
-    psllw       m3, m0
-    psllw       m4, m0
-    movu        [r0], m1
-    movu        [r0 + 16], m2
-    movu        [r0 + r2], m3
-    movu        [r0 + r2 + 16], m4
-
-    ; Row 4-7
-    movu        m1, [r1 + 4 * mmsize]
-    movu        m2, [r1 + 5 * mmsize]
-    movu        m3, [r1 + 6 * mmsize]
-    movu        m4, [r1 + 7 * mmsize]
-    psllw       m1, m0
-    psllw       m2, m0
-    psllw       m3, m0
-    psllw       m4, m0
-    movu        [r0 + r2 * 2], m1
-    movu        [r0 + r2 * 2 + 16], m2
-    lea         r0, [r0 + r2 * 2]
-    movu        [r0 + r2], m3
-    movu        [r0 + r2 + 16], m4
+    psraw       m1, 1
+    mov         r3d, 32/2
+
+.loop:
+    ; Row 0
+    mova        m2, [r1 + 0 * mmsize]
+    mova        m3, [r1 + 1 * mmsize]
+    mova        m4, [r1 + 2 * mmsize]
+    mova        m5, [r1 + 3 * mmsize]
+    psubw       m2, m1
+    psubw       m3, m1
+    psubw       m4, m1
+    psubw       m5, m1
+    psraw       m2, m0
+    psraw       m3, m0
+    psraw       m4, m0
+    psraw       m5, m0
+    mova        [r0 + 0 * mmsize], m2
+    mova        [r0 + 1 * mmsize], m3
+    mova        [r0 + 2 * mmsize], m4
+    mova        [r0 + 3 * mmsize], m5
+
+    ; Row 1
+    mova        m2, [r1 + 4 * mmsize]
+    mova        m3, [r1 + 5 * mmsize]
+    mova        m4, [r1 + 6 * mmsize]
+    mova        m5, [r1 + 7 * mmsize]
+    psubw       m2, m1
+    psubw       m3, m1
+    psubw       m4, m1
+    psubw       m5, m1
+    psraw       m2, m0
+    psraw       m3, m0
+    psraw       m4, m0
+    psraw       m5, m0
+    mova        [r0 + r2 + 0 * mmsize], m2
+    mova        [r0 + r2 + 1 * mmsize], m3
+    mova        [r0 + r2 + 2 * mmsize], m4
+    mova        [r0 + r2 + 3 * mmsize], m5
 
     add         r1, 8 * mmsize
     lea         r0, [r0 + r2 * 2]
@@ -4804,45 +4898,36 @@
     jnz        .loop
     RET
 
-;--------------------------------------------------------------------------------------
-; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
-;--------------------------------------------------------------------------------------
-INIT_XMM sse2
-cglobal copy_shl_32, 3,4,5
+
+INIT_YMM avx2
+cglobal cpy1Dto2D_shr_32, 3, 4, 6
     add         r2d, r2d
-    movd        m0, r3m
-    mov         r3d, 1024/64
+    movd        xm0, r3m
+    pcmpeqw     m1, m1
+    psllw       m1, xm0
+    psraw       m1, 1
+    mov         r3d, 32/2
 
 .loop:
-    ; Row 0-3
-    movu        m1, [r1 + 0 * mmsize]
-    movu        m2, [r1 + 1 * mmsize]
-    movu        m3, [r1 + 2 * mmsize]
-    movu        m4, [r1 + 3 * mmsize]
-    psllw       m1, m0
-    psllw       m2, m0
-    psllw       m3, m0
-    psllw       m4, m0
-    movu        [r0], m1
-    movu        [r0 + 16], m2
-    movu        [r0 + 32], m3
-    movu        [r0 + 48], m4
-
-    ; Row 4-7
-    movu        m1, [r1 + 4 * mmsize]
-    movu        m2, [r1 + 5 * mmsize]
-    movu        m3, [r1 + 6 * mmsize]
-    movu        m4, [r1 + 7 * mmsize]
-    psllw       m1, m0
-    psllw       m2, m0
-    psllw       m3, m0
-    psllw       m4, m0
-    movu        [r0 + r2], m1
-    movu        [r0 + r2 + 16], m2
-    movu        [r0 + r2 + 32], m3
-    movu        [r0 + r2 + 48], m4
-
-    add         r1, 8 * mmsize
+    ; Row 0-1
+    movu        m2, [r1 + 0 * mmsize]
+    movu        m3, [r1 + 1 * mmsize]
+    movu        m4, [r1 + 2 * mmsize]
+    movu        m5, [r1 + 3 * mmsize]
+    psubw       m2, m1
+    psubw       m3, m1
+    psubw       m4, m1
+    psubw       m5, m1
+    psraw       m2, xm0
+    psraw       m3, xm0
+    psraw       m4, xm0
+    psraw       m5, xm0
+    movu        [r0], m2
+    movu        [r0 + mmsize], m3
+    movu        [r0 + r2], m4
+    movu        [r0 + r2 + mmsize], m5
+
+    add         r1, 4 * mmsize
     lea         r0, [r0 + r2 * 2]
     dec         r3d
     jnz        .loop
diff -r dfe0803ae6be -r b4454aa1b6ab source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h	Wed Nov 26 16:56:00 2014 -0600
+++ b/source/common/x86/blockcopy8.h	Thu Nov 27 10:12:03 2014 +0900
@@ -24,32 +24,38 @@
 #ifndef X265_BLOCKCOPY8_H
 #define X265_BLOCKCOPY8_H
 
-void x265_cvt32to16_shl_4_sse2(int16_t* dst, const int* src, intptr_t, int);
-void x265_cvt32to16_shl_8_sse2(int16_t* dst, const int* src, intptr_t, int);
-void x265_cvt32to16_shl_16_sse2(int16_t* dst, const int* src, intptr_t, int);
-void x265_cvt32to16_shl_32_sse2(int16_t* dst, const int* src, intptr_t, int);
-void x265_cvt32to16_shl_4_avx2(int16_t* dst, const int* src, intptr_t, int);
-void x265_cvt32to16_shl_8_avx2(int16_t* dst, const int* src, intptr_t, int);
-void x265_cvt32to16_shl_16_avx2(int16_t* dst, const int* src, intptr_t, int);
-void x265_cvt32to16_shl_32_avx2(int16_t* dst, const int* src, intptr_t, int);
-void x265_copy16to16_shl_sse2(int16_t* dst, const int16_t* src, intptr_t, int32_t, int32_t);
-void x265_cvt16to32_shr_4_sse4(int32_t* dst, const int16_t* src, intptr_t, int32_t, int32_t);
-void x265_cvt16to32_shr_8_sse4(int32_t* dst, const int16_t* src, intptr_t, int32_t, int32_t);
-void x265_cvt16to32_shr_16_sse4(int32_t* dst, const int16_t* src, intptr_t, int32_t, int32_t);
-void x265_cvt16to32_shr_32_sse4(int32_t* dst, const int16_t* src, intptr_t, int32_t, int32_t);
-void x265_copy_shr_sse4(int16_t* dst, const int16_t* src, intptr_t, int, int);
-void x265_copy_shl_4_sse2(int16_t* dst, const int16_t* src, intptr_t, int);
-void x265_copy_shl_8_sse2(int16_t* dst, const int16_t* src, intptr_t, int);
-void x265_copy_shl_16_sse2(int16_t* dst, const int16_t* src, intptr_t, int);
-void x265_copy_shl_32_sse2(int16_t* dst, const int16_t* src, intptr_t, int);
-uint32_t x265_copy_cnt_4_sse4(int16_t* dst, const int16_t* src, intptr_t);
-uint32_t x265_copy_cnt_8_sse4(int16_t* dst, const int16_t* src, intptr_t);
-uint32_t x265_copy_cnt_16_sse4(int16_t* dst, const int16_t* src, intptr_t);
-uint32_t x265_copy_cnt_32_sse4(int16_t* dst, const int16_t* src, intptr_t);
-uint32_t x265_copy_cnt_4_avx2(int16_t* dst, const int16_t* src, intptr_t);
-uint32_t x265_copy_cnt_8_avx2(int16_t* dst, const int16_t* src, intptr_t);
-uint32_t x265_copy_cnt_16_avx2(int16_t* dst, const int16_t* src, intptr_t);
-uint32_t x265_copy_cnt_32_avx2(int16_t* dst, const int16_t* src, intptr_t);
+void x265_cpy2Dto1D_shl_4_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shl_8_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shl_16_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shl_32_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shr_4_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shr_8_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shr_16_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shr_32_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy1Dto2D_shl_4_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shl_8_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shl_16_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shl_32_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shl_4_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shl_8_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shl_16_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shl_32_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_4_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_8_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_16_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_32_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_4_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_8_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_16_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_32_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+uint32_t x265_copy_cnt_4_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
+uint32_t x265_copy_cnt_8_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
+uint32_t x265_copy_cnt_16_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
+uint32_t x265_copy_cnt_32_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
+uint32_t x265_copy_cnt_4_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
+uint32_t x265_copy_cnt_8_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
+uint32_t x265_copy_cnt_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
+uint32_t x265_copy_cnt_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
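
The declarations above follow one naming rule: cpy2Dto1D_* kernels
read a strided 2D block (srcStride) and write a packed linear buffer,
cpy1Dto2D_* kernels read a packed linear buffer and write a strided 2D
block (dstStride), and _shl/_shr select the shift direction. All four
families share one function-pointer shape, along the lines of the
typedefs the test harness uses (presumably declared in the primitives
header; shown here only for orientation):

    typedef void (*cpy2Dto1D_shl_t)(int16_t* dst, const int16_t* src,
                                    intptr_t srcStride, int shift);
    typedef void (*cpy1Dto2D_shr_t)(int16_t* dst, const int16_t* src,
                                    intptr_t dstStride, int shift);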
 
 #define SETUP_BLOCKCOPY_FUNC(W, H, cpu) \
     void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); \
@@ -181,17 +187,17 @@
 void x265_blockfill_s_8x8_sse2(int16_t* dst, intptr_t dstride, int16_t val);
 void x265_blockfill_s_16x16_sse2(int16_t* dst, intptr_t dstride, int16_t val);
 void x265_blockfill_s_32x32_sse2(int16_t* dst, intptr_t dstride, int16_t val);
-void x265_blockcopy_ss_16x4_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
-void x265_blockcopy_ss_16x8_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
-void x265_blockcopy_ss_16x12_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
-void x265_blockcopy_ss_16x16_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
-void x265_blockcopy_ss_16x24_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
-void x265_blockcopy_ss_16x32_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
-void x265_blockcopy_ss_16x64_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
-void x265_blockcopy_ss_64x16_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
-void x265_blockcopy_ss_64x32_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
-void x265_blockcopy_ss_64x48_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
-void x265_blockcopy_ss_64x64_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
+void x265_blockcopy_ss_16x4_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_16x8_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_16x12_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_16x16_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_16x24_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_16x32_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_16x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_64x16_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_64x32_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_64x48_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_64x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
 
 void x265_blockcopy_pp_32x8_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
 void x265_blockcopy_pp_32x16_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
diff -r dfe0803ae6be -r b4454aa1b6ab source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm	Wed Nov 26 16:56:00 2014 -0600
+++ b/source/common/x86/dct8.asm	Thu Nov 27 10:12:03 2014 +0900
@@ -318,7 +318,7 @@
 cextern pw_ppppmmmm
 
 ;------------------------------------------------------
-;void dct4(int16_t *src, int16_t *dst, intptr_t stride)
+;void dct4(const int16_t* src, int16_t* dst, intptr_t srcStride)
 ;------------------------------------------------------
 INIT_XMM sse2
 cglobal dct4, 3, 4, 8
@@ -475,7 +475,7 @@
     RET
 
 ;-------------------------------------------------------
-;void idct4(int16_t *src, int16_t *dst, intptr_t stride)
+;void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride)
 ;-------------------------------------------------------
 INIT_XMM sse2
 cglobal idct4, 3, 4, 7
@@ -565,7 +565,7 @@
     RET
 
 ;------------------------------------------------------
-;void dst4(int16_t *src, int16_t *dst, intptr_t stride)
+;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride)
 ;------------------------------------------------------
 INIT_XMM ssse3
 %if ARCH_X86_64
@@ -657,7 +657,7 @@
     RET
 
 ;-------------------------------------------------------
-;void idst4(int16_t *src, int16_t *dst, intptr_t stride)
+;void idst4(const int16_t* src, int16_t* dst, intptr_t dstStride)
 ;-------------------------------------------------------
 INIT_XMM sse2
 cglobal idst4, 3, 4, 7
@@ -750,7 +750,7 @@
 
 
 ;-------------------------------------------------------
-; void dct8(int16_t *src, int16_t *dst, intptr_t stride)
+; void dct8(const int16_t* src, int16_t* dst, intptr_t srcStride)
 ;-------------------------------------------------------
 INIT_XMM sse4
 cglobal dct8, 3,6,7,0-16*mmsize
@@ -974,7 +974,7 @@
     RET
 
 ;-------------------------------------------------------
-; void idct8(int16_t *src, int16_t *dst, intptr_t stride)
+; void idct8(const int16_t* src, int16_t* dst, intptr_t dstStride)
 ;-------------------------------------------------------
 INIT_XMM ssse3
 
@@ -1164,7 +1164,7 @@
 
 
 ;-----------------------------------------------------------------------------
-; void denoise_dct(int16_t *dct, uint32_t *sum, uint16_t *offset, int size)
+; void denoise_dct(int16_t* dct, uint32_t* sum, uint16_t* offset, int size)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal denoise_dct, 4, 4, 6
@@ -2106,7 +2106,7 @@
 %endmacro
 
 ;-------------------------------------------------------
-; void idct16(int16_t *src, int16_t *dst, intptr_t stride)
+; void idct16(const int16_t* src, int16_t* dst, intptr_t dstStride)
 ;-------------------------------------------------------
 INIT_YMM avx2
 cglobal idct16, 3, 7, 16, 0-16*mmsize
@@ -2385,7 +2385,7 @@
 %endmacro
 
 ;-------------------------------------------------------
-; void idct32(int16_t *src, int16_t *dst, intptr_t stride)
+; void idct32(const int16_t* src, int16_t* dst, intptr_t dstStride)
 ;-------------------------------------------------------
 
; TODO: Reduce PHADDD instruction count by using PADDD
@@ -2684,7 +2684,7 @@
     RET
 
 ;-------------------------------------------------------
-; void idct4(int16_t *src, int16_t *dst, intptr_t stride)
+; void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride)
 ;-------------------------------------------------------
 INIT_YMM avx2
 cglobal idct4, 3, 4, 6
diff -r dfe0803ae6be -r b4454aa1b6ab source/common/x86/dct8.h
--- a/source/common/x86/dct8.h	Wed Nov 26 16:56:00 2014 -0600
+++ b/source/common/x86/dct8.h	Thu Nov 27 10:12:03 2014 +0900
@@ -23,21 +23,21 @@
 
 #ifndef X265_DCT8_H
 #define X265_DCT8_H
-void x265_dct4_sse2(const int16_t* src, int16_t* dst, intptr_t stride);
-void x265_dst4_ssse3(const int16_t* src, int16_t* dst, intptr_t stride);
-void x265_dct8_sse4(const int16_t* src, int16_t* dst, intptr_t stride);
-void x265_dct4_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
-void x265_dct8_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
-void x265_dct16_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
-void x265_dct32_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
-void x265_idct32_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
+void x265_dct4_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void x265_dst4_ssse3(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void x265_dct8_sse4(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void x265_dct4_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void x265_dct8_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void x265_dct16_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void x265_dct32_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
 
-void x265_idst4_sse2(const int16_t* src, int16_t* dst, intptr_t stride);
-void x265_idct4_sse2(const int16_t* src, int16_t* dst, intptr_t stride);
-void x265_idct4_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
-void x265_idct8_ssse3(const int16_t* src, int16_t* dst, intptr_t stride);
-void x265_idct8_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
-void x265_idct16_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
+void x265_idst4_sse2(const int16_t* src, int16_t* dst, intptr_t dstStride);
+void x265_idct4_sse2(const int16_t* src, int16_t* dst, intptr_t dstStride);
+void x265_idct4_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
+void x265_idct8_ssse3(const int16_t* src, int16_t* dst, intptr_t dstStride);
+void x265_idct8_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
+void x265_idct16_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
+void x265_idct32_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
 
 void x265_denoise_dct_sse4(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
 void x265_denoise_dct_avx2(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
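
The regrouped declarations above encode the same convention as the
copy primitives: forward transforms read a strided 2D residual and
write packed coefficients, so they take srcStride; inverse transforms
read packed coefficients and write a strided 2D block, so they take
dstStride. Hypothetical call sites (buffer names illustrative, strides
in int16_t units):

    ALIGN_VAR_16(int16_t, residual[8 * 8]);   /* 2D residual, stride 8 */
    ALIGN_VAR_16(int16_t, coeff[8 * 8]);      /* packed coefficients   */

    x265_dct8_sse4(residual, coeff, 8);       /* srcStride = 8 */
    x265_idct8_ssse3(coeff, residual, 8);     /* dstStride = 8 */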
diff -r dfe0803ae6be -r b4454aa1b6ab source/encoder/search.cpp
--- a/source/encoder/search.cpp	Wed Nov 26 16:56:00 2014 -0600
+++ b/source/encoder/search.cpp	Thu Nov 27 10:12:03 2014 +0900
@@ -2211,8 +2211,8 @@
             if (bTryZero)
             {
                 /* coincident blocks of the two reference pictures */
-                const pixel *ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
-                const pixel *ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
+                const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
+                const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
                 intptr_t refStride = slice->m_mref[0][0].lumaStride;
 
                 primitives.pixelavg_pp[partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
diff -r dfe0803ae6be -r b4454aa1b6ab source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Wed Nov 26 16:56:00 2014 -0600
+++ b/source/test/pixelharness.cpp	Thu Nov 27 10:12:03 2014 +0900
@@ -344,60 +344,7 @@
     return true;
 }
 
-bool PixelHarness::check_copy16to16_shl_t(cpy16to16_shl_t ref, cpy16to16_shl_t opt)
-{
-    ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
-    ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
-
-    int j = 0;
-    intptr_t stride = STRIDE;
-    for (int i = 0; i < ITERS; i++)
-    {
-        int shift = (rand() % 7 + 1);
-
-        int index = i % TEST_CASES;
-        checked(opt, opt_dest, short_test_buff[index] + j, stride, shift, (int)stride);
-        ref(ref_dest, short_test_buff[index] + j, stride, shift, (int)stride);
-
-        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
-            return false;
-
-        reportfail();
-        j += INCR;
-    }
-
-    return true;
-}
-
-bool PixelHarness::check_cvt16to32_shr_t(cvt16to32_shr_t ref, cvt16to32_shr_t opt)
-{
-    ALIGN_VAR_16(int32_t, ref_dest[64 * 64]);
-    ALIGN_VAR_16(int32_t, opt_dest[64 * 64]);
-
-    memset(ref_dest, 0xCD, sizeof(ref_dest));
-    memset(opt_dest, 0xCD, sizeof(opt_dest));
-
-    int j = 0;
-    intptr_t stride = STRIDE;
-    for (int i = 0; i < ITERS; i++)
-    {
-        int shift = (rand() % 7 + 1);
-
-        int index = i % TEST_CASES;
-        checked(opt, opt_dest, short_test_buff[index] + j, stride, shift, (int)stride);
-        ref(ref_dest, short_test_buff[index] + j, stride, shift, (int)stride);
-
-        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int32_t)))
-            return false;
-
-        reportfail();
-        j += INCR;
-    }
-
-    return true;
-}
-
-bool PixelHarness::check_cvt32to16_shl_t(cvt32to16_shl_t ref, cvt32to16_shl_t opt)
+bool PixelHarness::check_cpy2Dto1D_shl_t(cpy2Dto1D_shl_t ref, cpy2Dto1D_shl_t opt)
 {
     ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
     ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
@@ -412,8 +359,36 @@
         int shift = (rand() % 7 + 1);
 
         int index = i % TEST_CASES;
-        checked(opt, opt_dest, int_test_buff[index] + j, stride, shift);
-        ref(ref_dest, int_test_buff[index] + j, stride, shift);
+        checked(opt, opt_dest, short_test_buff[index] + j, stride, shift);
+        ref(ref_dest, short_test_buff[index] + j, stride, shift);
+
+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
+            return false;
+
+        reportfail();
+        j += INCR;
+    }
+
+    return true;
+}
+
+bool PixelHarness::check_cpy2Dto1D_shr_t(cpy2Dto1D_shr_t ref, cpy2Dto1D_shr_t opt)
+{
+    ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
+    ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
+
+    memset(ref_dest, 0xCD, sizeof(ref_dest));
+    memset(opt_dest, 0xCD, sizeof(opt_dest));
+
+    int j = 0;
+    intptr_t stride = STRIDE;
+    for (int i = 0; i < ITERS; i++)
+    {
+        int shift = (rand() % 7 + 1);
+
+        int index = i % TEST_CASES;
+        checked(opt, opt_dest, short_test_buff[index] + j, stride, shift);
+        ref(ref_dest, short_test_buff[index] + j, stride, shift);
 
         if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
             return false;
@@ -451,7 +426,7 @@
     return true;
 }
 
-bool PixelHarness::check_copy_shr_t(copy_shr_t ref, copy_shr_t opt)
+bool PixelHarness::check_cpy1Dto2D_shl_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt)
 {
     ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
     ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
@@ -466,8 +441,8 @@
         int shift = (rand() % 7 + 1);
 
         int index = i % TEST_CASES;
-        checked(opt, opt_dest, short_test_buff[index] + j, stride, shift, (int)STRIDE);
-        ref(ref_dest, short_test_buff[index] + j, stride, shift, (int)STRIDE);
+        checked(opt, opt_dest, short_test_buff[index] + j, stride, shift);
+        ref(ref_dest, short_test_buff[index] + j, stride, shift);
 
         if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
             return false;
@@ -479,7 +454,7 @@
     return true;
 }
 
-bool PixelHarness::check_copy_shl_t(copy_shl_t ref, copy_shl_t opt)
+bool PixelHarness::check_cpy1Dto2D_shr_t(cpy1Dto2D_shr_t ref, cpy1Dto2D_shr_t opt)
 {
     ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
     ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
@@ -1280,41 +1255,40 @@
             }
         }
 
-        if ((i < BLOCK_64x64) && opt.cvt16to32_shr[i])
+        if ((i < BLOCK_64x64) && opt.cpy2Dto1D_shl[i])
         {
-            if (!check_cvt16to32_shr_t(ref.cvt16to32_shr[i], opt.cvt16to32_shr[i]))
+            if (!check_cpy2Dto1D_shl_t(ref.cpy2Dto1D_shl[i], opt.cpy2Dto1D_shl[i]))
             {
-                printf("cvt16to32_shr failed!\n");
+                printf("cpy2Dto1D_shl failed!\n");
                 return false;
             }
         }
 
-        if ((i < BLOCK_64x64) && opt.cvt32to16_shl[i])
+        if ((i < BLOCK_64x64) && opt.cpy2Dto1D_shr[i])
         {
-            if (!check_cvt32to16_shl_t(ref.cvt32to16_shl[i], opt.cvt32to16_shl[i]))
+            if (!check_cpy2Dto1D_shr_t(ref.cpy2Dto1D_shr[i], opt.cpy2Dto1D_shr[i]))
             {
-                printf("cvt32to16_shl failed!\n");
+                printf("cpy2Dto1D_shr failed!\n");
                 return false;
             }
         }
 
-        if ((i < BLOCK_64x64) && opt.copy_shl[i])
+        if ((i < BLOCK_64x64) && opt.cpy1Dto2D_shl[i])
         {
-            if (!check_copy_shl_t(ref.copy_shl[i], opt.copy_shl[i]))
+            if (!check_cpy1Dto2D_shl_t(ref.cpy1Dto2D_shl[i], opt.cpy1Dto2D_shl[i]))
             {
-                printf("copy_shl[%dx%d] failed!\n", 4 << i, 4 << i);
+                printf("cpy1Dto2D_shl[%dx%d] failed!\n", 4 << i, 4 << i);
                 return false;
             }
         }
 
-    }
-
-    if (opt.cpy16to16_shl)
-    {
-        if (!check_copy16to16_shl_t(ref.cpy16to16_shl, opt.cpy16to16_shl))
+        if ((i < BLOCK_64x64) && opt.cpy1Dto2D_shr[i])
         {
-            printf("copy16to16_shl failed!\n");
-            return false;
+            if (!check_cpy1Dto2D_shr_t(ref.cpy1Dto2D_shr[i], opt.cpy1Dto2D_shr[i]))
+            {
+                printf("cpy1Dto2D_shr[%dx%d] failed!\n", 4 << i, 4 << i);
+                return false;
+            }
         }
     }
 
@@ -1408,15 +1382,6 @@
         }
     }
 
-    if (opt.copy_shr)
-    {
-        if (!check_copy_shr_t(ref.copy_shr, opt.copy_shr))
-        {
-            printf("copy_shr failed!\n");
-            return false;
-        }
-    }
-
     return true;
 }
 
@@ -1637,16 +1602,28 @@
             REPORT_SPEEDUP(opt.var[i], ref.var[i], pbuf1, STRIDE);
         }
 
-        if ((i < BLOCK_64x64) && opt.cvt16to32_shr[i])
+        if ((i < BLOCK_64x64) && opt.cpy2Dto1D_shl[i])
         {
-            HEADER("cvt16to32_shr[%dx%d]", 4 << i, 4 << i);
-            REPORT_SPEEDUP(opt.cvt16to32_shr[i], ref.cvt16to32_shr[i], ibuf1, sbuf2, STRIDE, 3, 4);
+            HEADER("cpy2Dto1D_shl[%dx%d]", 4 << i, 4 << i);
+            REPORT_SPEEDUP(opt.cpy2Dto1D_shl[i], ref.cpy2Dto1D_shl[i], sbuf1, sbuf2, STRIDE, MAX_TR_DYNAMIC_RANGE - X265_DEPTH - (i + 2));
         }
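
The shift passed to cpy2Dto1D_shl above mirrors the transform-skip
path these primitives serve: a skipped transform scales the residual
by MAX_TR_DYNAMIC_RANGE - bitDepth - log2(trSize) instead of
transforming it, and with block index i the transform size is 4 << i,
i.e. log2(trSize) = i + 2. In sketch form:

    int log2TrSize = i + 2;  /* trSize = 4 << i */
    int shift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;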
 
-        if ((i < BLOCK_64x64) && opt.cvt32to16_shl[i])
+        if ((i < BLOCK_64x64) && opt.cpy2Dto1D_shr[i])
         {
-            HEADER("cvt32to16_shl[%dx%d]", 4 << i, 4 << i);
-            REPORT_SPEEDUP(opt.cvt32to16_shl[i], ref.cvt32to16_shl[i], sbuf2, ibuf1, STRIDE, 3);
+            HEADER("cpy2Dto1D_shr[%dx%d]", 4 << i, 4 << i);
+            REPORT_SPEEDUP(opt.cpy2Dto1D_shr[i], ref.cpy2Dto1D_shr[i], sbuf1, sbuf2, STRIDE, 3);
+        }
+
+        if ((i < BLOCK_64x64) && opt.cpy1Dto2D_shl[i])
+        {
+            HEADER("cpy1Dto2D_shl[%dx%d]", 4 << i, 4 << i);
+            REPORT_SPEEDUP(opt.cpy1Dto2D_shl[i], ref.cpy1Dto2D_shl[i], sbuf1, sbuf2, STRIDE, 64);
+        }
+
+        if ((i < BLOCK_64x64) && opt.cpy1Dto2D_shr[i])
+        {
+            HEADER("cpy1Dto2D_shr[%dx%d]", 4 << i, 4 << i);
+            REPORT_SPEEDUP(opt.cpy1Dto2D_shr[i], ref.cpy1Dto2D_shr[i], sbuf1, sbuf2, STRIDE, 64);
         }
 
         if ((i < BLOCK_64x64) && opt.copy_cnt[i])
@@ -1654,19 +1631,6 @@
             HEADER("copy_cnt[%dx%d]", 4 << i, 4 << i);
             REPORT_SPEEDUP(opt.copy_cnt[i], ref.copy_cnt[i], sbuf1, sbuf2, STRIDE);
         }
-
-        if ((i < BLOCK_64x64) && opt.copy_shl[i])
-        {
-            HEADER("copy_shl[%dx%d]", 4 << i, 4 << i);
-            REPORT_SPEEDUP(opt.copy_shl[i], ref.copy_shl[i], sbuf1, sbuf2, STRIDE, 64);
-        }
-
-    }
-
-    if (opt.cpy16to16_shl)
-    {
-        HEADER0("cpy16to16_shl");
-        REPORT_SPEEDUP(opt.cpy16to16_shl, ref.cpy16to16_shl, sbuf2, sbuf1, 64, 5, 64);
     }
 
     if (opt.weight_pp)
@@ -1728,11 +1692,4 @@
         HEADER0("planecopy_cp");
         REPORT_SPEEDUP(opt.planecopy_cp, ref.planecopy_cp, uchar_test_buff[0], 64, pbuf1, 64, 64, 64, 2);
     }
-
-    if (opt.copy_shr)
-    {
-        HEADER0("copy_shr");
-        REPORT_SPEEDUP(opt.copy_shr, ref.copy_shr, sbuf1, sbuf2, 64, 5, 64);
-    }
-
 }
diff -r dfe0803ae6be -r b4454aa1b6ab source/test/pixelharness.h
--- a/source/test/pixelharness.h	Wed Nov 26 16:56:00 2014 -0600
+++ b/source/test/pixelharness.h	Thu Nov 27 10:12:03 2014 +0900
@@ -80,12 +80,11 @@
     bool check_weightp(weightp_pp_t ref, weightp_pp_t opt);
     bool check_weightp(weightp_sp_t ref, weightp_sp_t opt);
     bool check_downscale_t(downscale_t ref, downscale_t opt);
-    bool check_copy16to16_shl_t(cpy16to16_shl_t ref, cpy16to16_shl_t opt);
-    bool check_cvt16to32_shr_t(cvt16to32_shr_t ref, cvt16to32_shr_t opt);
-    bool check_cvt32to16_shl_t(cvt32to16_shl_t ref, cvt32to16_shl_t opt);
+    bool check_cpy2Dto1D_shl_t(cpy2Dto1D_shl_t ref, cpy2Dto1D_shl_t opt);
+    bool check_cpy2Dto1D_shr_t(cpy2Dto1D_shr_t ref, cpy2Dto1D_shr_t opt);
+    bool check_cpy1Dto2D_shl_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt);
+    bool check_cpy1Dto2D_shr_t(cpy1Dto2D_shr_t ref, cpy1Dto2D_shr_t opt);
     bool check_copy_cnt_t(copy_cnt_t ref, copy_cnt_t opt);
-    bool check_copy_shr_t(copy_shr_t ref, copy_shr_t opt);
-    bool check_copy_shl_t(copy_shl_t ref, copy_shl_t opt);
     bool check_pixel_var(var_t ref, var_t opt);
     bool check_ssim_4x4x2_core(ssim_4x4x2_core_t ref, ssim_4x4x2_core_t opt);
     bool check_ssim_end(ssim_end4_t ref, ssim_end4_t opt);

