[x265] primitives: refactor tskip related
Steve Borho
steve at borho.org
Fri Nov 28 20:51:07 CET 2014
On 11/27, Satoshi Nakagawa wrote:
> # HG changeset patch
> # User Satoshi Nakagawa <nakagawa424 at oki.com>
> # Date 1417050723 -32400
> # Thu Nov 27 10:12:03 2014 +0900
> # Node ID b4454aa1b6ab610c20241eb8fd5c73268b1ae3e0
> # Parent dfe0803ae6be925281cd6101fc0354a34bedfefd
> primitives: refactor tskip related
Pushed as two patches: one for the nits and the other for the copy
primitive API changes.
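
For anyone following along, here is a minimal sketch of what the reworked
copy primitives do, paraphrasing the C reference implementations in the
patch quoted below: the cpy2Dto1D_* functions gather a strided residual
block into a packed coefficient buffer, and the cpy1Dto2D_* functions
scatter a packed buffer back out to a strided block, applying the
transform-skip shift as they go. The 4x4 round trip, the stride of 8, and
the shift of 5 are illustrative values only (5 happens to match the 8-bit
4x4 transformShift), not code from the repository.

    #include <cstdint>
    #include <cstdio>

    template<int size>
    void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
    {
        // gather a strided 2D block into a packed 1D buffer, shifting left
        for (int i = 0; i < size; i++, src += srcStride, dst += size)
            for (int j = 0; j < size; j++)
                dst[j] = src[j] << shift;
    }

    template<int size>
    void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
    {
        // scatter a packed 1D buffer back to a strided 2D block, with rounding right shift
        int16_t round = 1 << (shift - 1);
        for (int i = 0; i < size; i++, src += size, dst += dstStride)
            for (int j = 0; j < size; j++)
                dst[j] = (src[j] + round) >> shift;
    }

    int main()
    {
        // illustrative 4x4 transform-skip round trip with made-up stride/shift
        int16_t resi[4 * 8] = { 1, 2, 3, 4 };
        int16_t coef[4 * 4];
        cpy2Dto1D_shl<4>(coef, resi, 8, 5);  // forward path: pack and scale up
        cpy1Dto2D_shr<4>(resi, coef, 8, 5);  // inverse path: unpack and scale down
        printf("%d %d\n", coef[0], resi[0]); // prints 32 1
        return 0;
    }
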
> diff -r dfe0803ae6be -r b4454aa1b6ab source/common/dct.cpp
> --- a/source/common/dct.cpp Wed Nov 26 16:56:00 2014 -0600
> +++ b/source/common/dct.cpp Thu Nov 27 10:12:03 2014 +0900
> @@ -440,7 +440,7 @@
> }
> }
>
> -void dst4_c(const int16_t *src, int16_t *dst, intptr_t stride)
> +void dst4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
> {
> const int shift_1st = 1 + X265_DEPTH - 8;
> const int shift_2nd = 8;
> @@ -450,14 +450,14 @@
>
> for (int i = 0; i < 4; i++)
> {
> - memcpy(&block[i * 4], &src[i * stride], 4 * sizeof(int16_t));
> + memcpy(&block[i * 4], &src[i * srcStride], 4 * sizeof(int16_t));
> }
>
> fastForwardDst(block, coef, shift_1st);
> fastForwardDst(coef, dst, shift_2nd);
> }
>
> -void dct4_c(const int16_t *src, int16_t *dst, intptr_t stride)
> +void dct4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
> {
> const int shift_1st = 1 + X265_DEPTH - 8;
> const int shift_2nd = 8;
> @@ -467,14 +467,14 @@
>
> for (int i = 0; i < 4; i++)
> {
> - memcpy(&block[i * 4], &src[i * stride], 4 * sizeof(int16_t));
> + memcpy(&block[i * 4], &src[i * srcStride], 4 * sizeof(int16_t));
> }
>
> partialButterfly4(block, coef, shift_1st, 4);
> partialButterfly4(coef, dst, shift_2nd, 4);
> }
>
> -void dct8_c(const int16_t *src, int16_t *dst, intptr_t stride)
> +void dct8_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
> {
> const int shift_1st = 2 + X265_DEPTH - 8;
> const int shift_2nd = 9;
> @@ -484,14 +484,14 @@
>
> for (int i = 0; i < 8; i++)
> {
> - memcpy(&block[i * 8], &src[i * stride], 8 * sizeof(int16_t));
> + memcpy(&block[i * 8], &src[i * srcStride], 8 * sizeof(int16_t));
> }
>
> partialButterfly8(block, coef, shift_1st, 8);
> partialButterfly8(coef, dst, shift_2nd, 8);
> }
>
> -void dct16_c(const int16_t *src, int16_t *dst, intptr_t stride)
> +void dct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
> {
> const int shift_1st = 3 + X265_DEPTH - 8;
> const int shift_2nd = 10;
> @@ -501,14 +501,14 @@
>
> for (int i = 0; i < 16; i++)
> {
> - memcpy(&block[i * 16], &src[i * stride], 16 * sizeof(int16_t));
> + memcpy(&block[i * 16], &src[i * srcStride], 16 * sizeof(int16_t));
> }
>
> partialButterfly16(block, coef, shift_1st, 16);
> partialButterfly16(coef, dst, shift_2nd, 16);
> }
>
> -void dct32_c(const int16_t *src, int16_t *dst, intptr_t stride)
> +void dct32_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
> {
> const int shift_1st = 4 + X265_DEPTH - 8;
> const int shift_2nd = 11;
> @@ -518,14 +518,14 @@
>
> for (int i = 0; i < 32; i++)
> {
> - memcpy(&block[i * 32], &src[i * stride], 32 * sizeof(int16_t));
> + memcpy(&block[i * 32], &src[i * srcStride], 32 * sizeof(int16_t));
> }
>
> partialButterfly32(block, coef, shift_1st, 32);
> partialButterfly32(coef, dst, shift_2nd, 32);
> }
>
> -void idst4_c(const int16_t *src, int16_t *dst, intptr_t stride)
> +void idst4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
> {
> const int shift_1st = 7;
> const int shift_2nd = 12 - (X265_DEPTH - 8);
> @@ -538,11 +538,11 @@
>
> for (int i = 0; i < 4; i++)
> {
> - memcpy(&dst[i * stride], &block[i * 4], 4 * sizeof(int16_t));
> + memcpy(&dst[i * dstStride], &block[i * 4], 4 * sizeof(int16_t));
> }
> }
>
> -void idct4_c(const int16_t *src, int16_t *dst, intptr_t stride)
> +void idct4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
> {
> const int shift_1st = 7;
> const int shift_2nd = 12 - (X265_DEPTH - 8);
> @@ -555,11 +555,11 @@
>
> for (int i = 0; i < 4; i++)
> {
> - memcpy(&dst[i * stride], &block[i * 4], 4 * sizeof(int16_t));
> + memcpy(&dst[i * dstStride], &block[i * 4], 4 * sizeof(int16_t));
> }
> }
>
> -void idct8_c(const int16_t *src, int16_t *dst, intptr_t stride)
> +void idct8_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
> {
> const int shift_1st = 7;
> const int shift_2nd = 12 - (X265_DEPTH - 8);
> @@ -569,13 +569,14 @@
>
> partialButterflyInverse8(src, coef, shift_1st, 8);
> partialButterflyInverse8(coef, block, shift_2nd, 8);
> +
> for (int i = 0; i < 8; i++)
> {
> - memcpy(&dst[i * stride], &block[i * 8], 8 * sizeof(int16_t));
> + memcpy(&dst[i * dstStride], &block[i * 8], 8 * sizeof(int16_t));
> }
> }
>
> -void idct16_c(const int16_t *src, int16_t *dst, intptr_t stride)
> +void idct16_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
> {
> const int shift_1st = 7;
> const int shift_2nd = 12 - (X265_DEPTH - 8);
> @@ -585,13 +586,14 @@
>
> partialButterflyInverse16(src, coef, shift_1st, 16);
> partialButterflyInverse16(coef, block, shift_2nd, 16);
> +
> for (int i = 0; i < 16; i++)
> {
> - memcpy(&dst[i * stride], &block[i * 16], 16 * sizeof(int16_t));
> + memcpy(&dst[i * dstStride], &block[i * 16], 16 * sizeof(int16_t));
> }
> }
>
> -void idct32_c(const int16_t *src, int16_t *dst, intptr_t stride)
> +void idct32_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
> {
> const int shift_1st = 7;
> const int shift_2nd = 12 - (X265_DEPTH - 8);
> @@ -604,7 +606,7 @@
>
> for (int i = 0; i < 32; i++)
> {
> - memcpy(&dst[i * stride], &block[i * 32], 32 * sizeof(int16_t));
> + memcpy(&dst[i * dstStride], &block[i * 32], 32 * sizeof(int16_t));
> }
> }
>
> @@ -632,7 +634,7 @@
> }
> }
>
> -void dequant_scaling_c(const int16_t* quantCoef, const int32_t *deQuantCoef, int16_t* coef, int num, int per, int shift)
> +void dequant_scaling_c(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift)
> {
> X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);
>
> @@ -724,15 +726,15 @@
> }
>
> template<int trSize>
> -uint32_t copy_count(int16_t* coeff, const int16_t* residual, intptr_t stride)
> +uint32_t copy_count(int16_t* coeff, const int16_t* residual, intptr_t resiStride)
> {
> uint32_t numSig = 0;
> for (int k = 0; k < trSize; k++)
> {
> for (int j = 0; j < trSize; j++)
> {
> - coeff[k * trSize + j] = residual[k * stride + j];
> - numSig += (residual[k * stride + j] != 0);
> + coeff[k * trSize + j] = residual[k * resiStride + j];
> + numSig += (residual[k * resiStride + j] != 0);
> }
> }
>
> diff -r dfe0803ae6be -r b4454aa1b6ab source/common/pixel.cpp
> --- a/source/common/pixel.cpp Wed Nov 26 16:56:00 2014 -0600
> +++ b/source/common/pixel.cpp Thu Nov 27 10:12:03 2014 +0900
> @@ -32,32 +32,32 @@
>
> using namespace x265;
>
> -#define SET_FUNC_PRIMITIVE_TABLE_C(FUNC_PREFIX, FUNC_PREFIX_DEF, FUNC_TYPE_CAST, DATA_TYPE1, DATA_TYPE2) \
> - p.FUNC_PREFIX[LUMA_4x4] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4, 4, DATA_TYPE1, DATA_TYPE2>; \
> - p.FUNC_PREFIX[LUMA_8x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 8, DATA_TYPE1, DATA_TYPE2>; \
> - p.FUNC_PREFIX[LUMA_8x4] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 4, DATA_TYPE1, DATA_TYPE2>; \
> - p.FUNC_PREFIX[LUMA_4x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4, 8, DATA_TYPE1, DATA_TYPE2>; \
> - p.FUNC_PREFIX[LUMA_16x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 16, DATA_TYPE1, DATA_TYPE2>; \
> - p.FUNC_PREFIX[LUMA_16x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 8, DATA_TYPE1, DATA_TYPE2>; \
> - p.FUNC_PREFIX[LUMA_8x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 16, DATA_TYPE1, DATA_TYPE2>; \
> - p.FUNC_PREFIX[LUMA_16x12] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 12, DATA_TYPE1, DATA_TYPE2>; \
> - p.FUNC_PREFIX[LUMA_12x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<12, 16, DATA_TYPE1, DATA_TYPE2>; \
> - p.FUNC_PREFIX[LUMA_16x4] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 4, DATA_TYPE1, DATA_TYPE2>; \
> - p.FUNC_PREFIX[LUMA_4x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4, 16, DATA_TYPE1, DATA_TYPE2>; \
> - p.FUNC_PREFIX[LUMA_32x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 32, DATA_TYPE1, DATA_TYPE2>; \
> - p.FUNC_PREFIX[LUMA_32x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 16, DATA_TYPE1, DATA_TYPE2>; \
> - p.FUNC_PREFIX[LUMA_16x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 32, DATA_TYPE1, DATA_TYPE2>; \
> - p.FUNC_PREFIX[LUMA_32x24] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 24, DATA_TYPE1, DATA_TYPE2>; \
> - p.FUNC_PREFIX[LUMA_24x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<24, 32, DATA_TYPE1, DATA_TYPE2>; \
> - p.FUNC_PREFIX[LUMA_32x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 8, DATA_TYPE1, DATA_TYPE2>; \
> - p.FUNC_PREFIX[LUMA_8x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 32, DATA_TYPE1, DATA_TYPE2>; \
> - p.FUNC_PREFIX[LUMA_64x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 64, DATA_TYPE1, DATA_TYPE2>; \
> - p.FUNC_PREFIX[LUMA_64x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 32, DATA_TYPE1, DATA_TYPE2>; \
> - p.FUNC_PREFIX[LUMA_32x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 64, DATA_TYPE1, DATA_TYPE2>; \
> - p.FUNC_PREFIX[LUMA_64x48] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 48, DATA_TYPE1, DATA_TYPE2>; \
> - p.FUNC_PREFIX[LUMA_48x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<48, 64, DATA_TYPE1, DATA_TYPE2>; \
> - p.FUNC_PREFIX[LUMA_64x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 16, DATA_TYPE1, DATA_TYPE2>; \
> - p.FUNC_PREFIX[LUMA_16x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 64, DATA_TYPE1, DATA_TYPE2>;
> +#define SET_FUNC_PRIMITIVE_TABLE_C(FUNC_PREFIX, FUNC_PREFIX_DEF, DATA_TYPE1, DATA_TYPE2) \
> + p.FUNC_PREFIX[LUMA_4x4] = FUNC_PREFIX_DEF<4, 4, DATA_TYPE1, DATA_TYPE2>; \
> + p.FUNC_PREFIX[LUMA_8x8] = FUNC_PREFIX_DEF<8, 8, DATA_TYPE1, DATA_TYPE2>; \
> + p.FUNC_PREFIX[LUMA_8x4] = FUNC_PREFIX_DEF<8, 4, DATA_TYPE1, DATA_TYPE2>; \
> + p.FUNC_PREFIX[LUMA_4x8] = FUNC_PREFIX_DEF<4, 8, DATA_TYPE1, DATA_TYPE2>; \
> + p.FUNC_PREFIX[LUMA_16x16] = FUNC_PREFIX_DEF<16, 16, DATA_TYPE1, DATA_TYPE2>; \
> + p.FUNC_PREFIX[LUMA_16x8] = FUNC_PREFIX_DEF<16, 8, DATA_TYPE1, DATA_TYPE2>; \
> + p.FUNC_PREFIX[LUMA_8x16] = FUNC_PREFIX_DEF<8, 16, DATA_TYPE1, DATA_TYPE2>; \
> + p.FUNC_PREFIX[LUMA_16x12] = FUNC_PREFIX_DEF<16, 12, DATA_TYPE1, DATA_TYPE2>; \
> + p.FUNC_PREFIX[LUMA_12x16] = FUNC_PREFIX_DEF<12, 16, DATA_TYPE1, DATA_TYPE2>; \
> + p.FUNC_PREFIX[LUMA_16x4] = FUNC_PREFIX_DEF<16, 4, DATA_TYPE1, DATA_TYPE2>; \
> + p.FUNC_PREFIX[LUMA_4x16] = FUNC_PREFIX_DEF<4, 16, DATA_TYPE1, DATA_TYPE2>; \
> + p.FUNC_PREFIX[LUMA_32x32] = FUNC_PREFIX_DEF<32, 32, DATA_TYPE1, DATA_TYPE2>; \
> + p.FUNC_PREFIX[LUMA_32x16] = FUNC_PREFIX_DEF<32, 16, DATA_TYPE1, DATA_TYPE2>; \
> + p.FUNC_PREFIX[LUMA_16x32] = FUNC_PREFIX_DEF<16, 32, DATA_TYPE1, DATA_TYPE2>; \
> + p.FUNC_PREFIX[LUMA_32x24] = FUNC_PREFIX_DEF<32, 24, DATA_TYPE1, DATA_TYPE2>; \
> + p.FUNC_PREFIX[LUMA_24x32] = FUNC_PREFIX_DEF<24, 32, DATA_TYPE1, DATA_TYPE2>; \
> + p.FUNC_PREFIX[LUMA_32x8] = FUNC_PREFIX_DEF<32, 8, DATA_TYPE1, DATA_TYPE2>; \
> + p.FUNC_PREFIX[LUMA_8x32] = FUNC_PREFIX_DEF<8, 32, DATA_TYPE1, DATA_TYPE2>; \
> + p.FUNC_PREFIX[LUMA_64x64] = FUNC_PREFIX_DEF<64, 64, DATA_TYPE1, DATA_TYPE2>; \
> + p.FUNC_PREFIX[LUMA_64x32] = FUNC_PREFIX_DEF<64, 32, DATA_TYPE1, DATA_TYPE2>; \
> + p.FUNC_PREFIX[LUMA_32x64] = FUNC_PREFIX_DEF<32, 64, DATA_TYPE1, DATA_TYPE2>; \
> + p.FUNC_PREFIX[LUMA_64x48] = FUNC_PREFIX_DEF<64, 48, DATA_TYPE1, DATA_TYPE2>; \
> + p.FUNC_PREFIX[LUMA_48x64] = FUNC_PREFIX_DEF<48, 64, DATA_TYPE1, DATA_TYPE2>; \
> + p.FUNC_PREFIX[LUMA_64x16] = FUNC_PREFIX_DEF<64, 16, DATA_TYPE1, DATA_TYPE2>; \
> + p.FUNC_PREFIX[LUMA_16x64] = FUNC_PREFIX_DEF<16, 64, DATA_TYPE1, DATA_TYPE2>;
>
> #define SET_FUNC_PRIMITIVE_TABLE_C2(FUNC_PREFIX) \
> p.FUNC_PREFIX[LUMA_4x4] = FUNC_PREFIX<4, 4>; \
> @@ -491,73 +491,73 @@
> }
> }
>
> -void copy16to16_shl(int16_t *dst, const int16_t *src, intptr_t stride, int shift, int size)
> +template<int size>
> +void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
> {
> - X265_CHECK(!(size & 3), "invalid size\n");
> + X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
> + X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n");
> + X265_CHECK(shift >= 0, "invalid shift\n");
> +
> for (int i = 0; i < size; i++)
> {
> for (int j = 0; j < size; j++)
> - {
> - dst[i * size + j] = src[i * stride + j] << shift;
> - }
> + dst[j] = src[j] << shift;
> +
> + src += srcStride;
> + dst += size;
> }
> }
>
> template<int size>
> -void convert16to32_shr(int32_t* dst, const int16_t* src, intptr_t stride, int shift, int offset)
> +void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
> {
> + X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
> + X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n");
> + X265_CHECK(shift > 0, "invalid shift\n");
> +
> + int16_t round = 1 << (shift - 1);
> for (int i = 0; i < size; i++)
> {
> for (int j = 0; j < size; j++)
> - {
> - dst[i * size + j] = ((int)src[i * stride + j] + offset) >> shift;
> - }
> + dst[j] = (src[j] + round) >> shift;
> +
> + src += srcStride;
> + dst += size;
> }
> }
>
> -void copy_shr(int16_t* dst, const int16_t* src, intptr_t stride, int shift, int size)
> +template<int size>
> +void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
> {
> - int round = 1 << (shift - 1);
> + X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n");
> + X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
> + X265_CHECK(shift >= 0, "invalid shift\n");
>
> for (int i = 0; i < size; i++)
> {
> for (int j = 0; j < size; j++)
> - {
> - dst[j] = (int16_t)((src[j] + round) >> shift);
> - }
> + dst[j] = src[j] << shift;
>
> src += size;
> - dst += stride;
> + dst += dstStride;
> }
> }
>
> template<int size>
> -void convert32to16_shl(int16_t* dst, const int32_t* src, intptr_t stride, int shift)
> +void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
> {
> + X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n");
> + X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
> + X265_CHECK(shift > 0, "invalid shift\n");
> +
> + int16_t round = 1 << (shift - 1);
> for (int i = 0; i < size; i++)
> {
> for (int j = 0; j < size; j++)
> - {
> - dst[j] = ((int16_t)src[j] << shift);
> - }
> + dst[j] = (src[j] + round) >> shift;
>
> src += size;
> - dst += stride;
> - }
> -}
> -
> -template<int size>
> -void copy_shl(int16_t* dst, const int16_t* src, intptr_t stride, int shift)
> -{
> - for (int i = 0; i < size; i++)
> - {
> - for (int j = 0; j < size; j++)
> - {
> - dst[j] = (src[j] << shift);
> - }
> -
> - src += size;
> - dst += stride;
> + dst += dstStride;
> }
> }
>
> @@ -1263,9 +1263,9 @@
> CHROMA_444(64, 16);
> CHROMA_444(16, 64);
>
> - SET_FUNC_PRIMITIVE_TABLE_C(sse_pp, sse, pixelcmp_t, pixel, pixel)
> - SET_FUNC_PRIMITIVE_TABLE_C(sse_sp, sse, pixelcmp_sp_t, int16_t, pixel)
> - SET_FUNC_PRIMITIVE_TABLE_C(sse_ss, sse, pixelcmp_ss_t, int16_t, int16_t)
> + SET_FUNC_PRIMITIVE_TABLE_C(sse_pp, sse, pixel, pixel)
> + SET_FUNC_PRIMITIVE_TABLE_C(sse_sp, sse, int16_t, pixel)
> + SET_FUNC_PRIMITIVE_TABLE_C(sse_ss, sse, int16_t, int16_t)
>
> p.blockfill_s[BLOCK_4x4] = blockfil_s_c<4>;
> p.blockfill_s[BLOCK_8x8] = blockfil_s_c<8>;
> @@ -1273,21 +1273,22 @@
> p.blockfill_s[BLOCK_32x32] = blockfil_s_c<32>;
> p.blockfill_s[BLOCK_64x64] = blockfil_s_c<64>;
>
> - p.cpy16to16_shl = copy16to16_shl;
> - p.cvt16to32_shr[BLOCK_4x4] = convert16to32_shr<4>;
> - p.cvt16to32_shr[BLOCK_8x8] = convert16to32_shr<8>;
> - p.cvt16to32_shr[BLOCK_16x16] = convert16to32_shr<16>;
> - p.cvt16to32_shr[BLOCK_32x32] = convert16to32_shr<32>;
> - p.cvt32to16_shl[BLOCK_4x4] = convert32to16_shl<4>;
> - p.cvt32to16_shl[BLOCK_8x8] = convert32to16_shl<8>;
> - p.cvt32to16_shl[BLOCK_16x16] = convert32to16_shl<16>;
> - p.cvt32to16_shl[BLOCK_32x32] = convert32to16_shl<32>;
> -
> - p.copy_shr = copy_shr;
> - p.copy_shl[BLOCK_4x4] = copy_shl<4>;
> - p.copy_shl[BLOCK_8x8] = copy_shl<8>;
> - p.copy_shl[BLOCK_16x16] = copy_shl<16>;
> - p.copy_shl[BLOCK_32x32] = copy_shl<32>;
> + p.cpy2Dto1D_shl[BLOCK_4x4] = cpy2Dto1D_shl<4>;
> + p.cpy2Dto1D_shl[BLOCK_8x8] = cpy2Dto1D_shl<8>;
> + p.cpy2Dto1D_shl[BLOCK_16x16] = cpy2Dto1D_shl<16>;
> + p.cpy2Dto1D_shl[BLOCK_32x32] = cpy2Dto1D_shl<32>;
> + p.cpy2Dto1D_shr[BLOCK_4x4] = cpy2Dto1D_shr<4>;
> + p.cpy2Dto1D_shr[BLOCK_8x8] = cpy2Dto1D_shr<8>;
> + p.cpy2Dto1D_shr[BLOCK_16x16] = cpy2Dto1D_shr<16>;
> + p.cpy2Dto1D_shr[BLOCK_32x32] = cpy2Dto1D_shr<32>;
> + p.cpy1Dto2D_shl[BLOCK_4x4] = cpy1Dto2D_shl<4>;
> + p.cpy1Dto2D_shl[BLOCK_8x8] = cpy1Dto2D_shl<8>;
> + p.cpy1Dto2D_shl[BLOCK_16x16] = cpy1Dto2D_shl<16>;
> + p.cpy1Dto2D_shl[BLOCK_32x32] = cpy1Dto2D_shl<32>;
> + p.cpy1Dto2D_shr[BLOCK_4x4] = cpy1Dto2D_shr<4>;
> + p.cpy1Dto2D_shr[BLOCK_8x8] = cpy1Dto2D_shr<8>;
> + p.cpy1Dto2D_shr[BLOCK_16x16] = cpy1Dto2D_shr<16>;
> + p.cpy1Dto2D_shr[BLOCK_32x32] = cpy1Dto2D_shr<32>;
>
> p.sa8d[BLOCK_4x4] = satd_4x4;
> p.sa8d[BLOCK_8x8] = sa8d_8x8;
> diff -r dfe0803ae6be -r b4454aa1b6ab source/common/primitives.h
> --- a/source/common/primitives.h Wed Nov 26 16:56:00 2014 -0600
> +++ b/source/common/primitives.h Thu Nov 27 10:12:03 2014 +0900
> @@ -138,32 +138,27 @@
> typedef int (*pixel_ssd_s_t)(const int16_t* fenc, intptr_t fencstride);
> typedef void (*pixelcmp_x4_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
> typedef void (*pixelcmp_x3_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> -typedef void (*blockcpy_sp_t)(int bx, int by, int16_t* dst, intptr_t dstride, const pixel* src, intptr_t sstride); // dst is aligned
> -typedef void (*blockcpy_sc_t)(int bx, int by, int16_t* dst, intptr_t dstride, const uint8_t* src, intptr_t sstride); // dst is aligned
> -typedef void (*pixelsub_ps_t)(int bx, int by, int16_t* dst, intptr_t dstride, const pixel* src0, const pixel* src1, intptr_t sstride0, intptr_t sstride1);
> typedef void (*pixelavg_pp_t)(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int weight);
> typedef void (*blockfill_s_t)(int16_t* dst, intptr_t dstride, int16_t val);
>
> typedef void (*intra_pred_t)(pixel* dst, intptr_t dstStride, pixel* refLeft, pixel* refAbove, int dirMode, int bFilter);
> typedef void (*intra_allangs_t)(pixel* dst, pixel* above0, pixel* left0, pixel* above1, pixel* left1, int bLuma);
>
> -typedef void (*cpy16to16_shl_t)(int16_t* dst, const int16_t* src, intptr_t, int, int);
> -typedef void (*cvt16to32_shl_t)(int32_t* dst, const int16_t* src, intptr_t, int, int);
> -typedef void (*cvt16to32_shr_t)(int32_t* dst, const int16_t* src, intptr_t, int, int);
> -typedef void (*cvt32to16_shl_t)(int16_t* dst, const int32_t* src, intptr_t, int);
> -typedef uint32_t (*copy_cnt_t)(int16_t* coeff, const int16_t* residual, intptr_t stride);
> -typedef void (*copy_shr_t)(int16_t* dst, const int16_t* src, intptr_t stride, int shift, int size);
> -typedef void (*copy_shl_t)(int16_t* dst, const int16_t* src, intptr_t stride, int shift);
> +typedef void (*cpy2Dto1D_shl_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> +typedef void (*cpy2Dto1D_shr_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> +typedef void (*cpy1Dto2D_shl_t)(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
> +typedef void (*cpy1Dto2D_shr_t)(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
> +typedef uint32_t (*copy_cnt_t)(int16_t* coeff, const int16_t* residual, intptr_t resiStride);
>
> -typedef void (*dct_t)(const int16_t* src, int16_t* dst, intptr_t stride);
> -typedef void (*idct_t)(const int16_t* src, int16_t* dst, intptr_t stride);
> +typedef void (*dct_t)(const int16_t* src, int16_t* dst, intptr_t srcStride);
> +typedef void (*idct_t)(const int16_t* src, int16_t* dst, intptr_t dstStride);
> typedef void (*denoiseDct_t)(int16_t* dctCoef, uint32_t* resSum, const uint16_t* offset, int numCoeff);
>
> typedef void (*calcresidual_t)(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
> typedef void (*transpose_t)(pixel* dst, const pixel* src, intptr_t stride);
> -typedef uint32_t (*quant_t)(const int16_t *coef, const int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
> -typedef uint32_t (*nquant_t)(const int16_t *coef, const int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
> -typedef void (*dequant_scaling_t)(const int16_t* src, const int32_t*vdequantCoef, int16_t* dst, int num, int mcqp_miper, int shift);
> +typedef uint32_t (*quant_t)(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
> +typedef uint32_t (*nquant_t)(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
> +typedef void (*dequant_scaling_t)(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift);
> typedef void (*dequant_normal_t)(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
> typedef int (*count_nonzero_t)(const int16_t* quantCoeff, int numCoeff);
>
> @@ -186,7 +181,7 @@
> typedef void (*filter_hv_pp_t) (const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY);
> typedef void (*filter_p2s_t)(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height);
>
> -typedef void (*copy_pp_t)(pixel* dst, intptr_t dstride, const pixel* src, intptr_t sstride); // dst is aligned
> +typedef void (*copy_pp_t)(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); // dst is aligned
> typedef void (*copy_sp_t)(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
> typedef void (*copy_ps_t)(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
> typedef void (*copy_ss_t)(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
> @@ -195,7 +190,7 @@
> typedef void (*pixel_add_ps_t)(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
> typedef void (*addAvg_t)(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
>
> -typedef void (*saoCuOrgE0_t)(pixel* rec, int8_t* offsetEo, int width, int8_t signLeft);
> +typedef void (*saoCuOrgE0_t)(pixel* rec, int8_t* offsetEo, int width, int8_t signLeft);
> typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
> typedef void (*planecopy_sp_t) (const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
>
> @@ -220,12 +215,11 @@
> pixelcmp_ss_t psy_cost_ss[NUM_SQUARE_BLOCKS];
>
> blockfill_s_t blockfill_s[NUM_SQUARE_BLOCKS]; // block fill with value
> - cpy16to16_shl_t cpy16to16_shl;
> - cvt16to32_shr_t cvt16to32_shr[NUM_SQUARE_BLOCKS - 1];
> - cvt32to16_shl_t cvt32to16_shl[NUM_SQUARE_BLOCKS - 1];
> + cpy2Dto1D_shl_t cpy2Dto1D_shl[NUM_SQUARE_BLOCKS - 1];
> + cpy2Dto1D_shr_t cpy2Dto1D_shr[NUM_SQUARE_BLOCKS - 1];
> + cpy1Dto2D_shl_t cpy1Dto2D_shl[NUM_SQUARE_BLOCKS - 1];
> + cpy1Dto2D_shr_t cpy1Dto2D_shr[NUM_SQUARE_BLOCKS - 1];
> copy_cnt_t copy_cnt[NUM_SQUARE_BLOCKS - 1];
> - copy_shr_t copy_shr;
> - copy_shl_t copy_shl[NUM_SQUARE_BLOCKS - 1];
>
> copy_pp_t luma_copy_pp[NUM_LUMA_PARTITIONS];
> copy_sp_t luma_copy_sp[NUM_LUMA_PARTITIONS];
> diff -r dfe0803ae6be -r b4454aa1b6ab source/common/quant.cpp
> --- a/source/common/quant.cpp Wed Nov 26 16:56:00 2014 -0600
> +++ b/source/common/quant.cpp Thu Nov 27 10:12:03 2014 +0900
> @@ -322,49 +322,46 @@
> return numSig;
> }
>
> -uint32_t Quant::transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t stride,
> +uint32_t Quant::transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t resiStride,
> coeff_t* coeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip)
> {
> + const uint32_t sizeIdx = log2TrSize - 2;
> if (cu.m_tqBypass[absPartIdx])
> {
> X265_CHECK(log2TrSize >= 2 && log2TrSize <= 5, "Block size mistake!\n");
> - return primitives.copy_cnt[log2TrSize - 2](coeff, residual, stride);
> + return primitives.copy_cnt[sizeIdx](coeff, residual, resiStride);
> }
>
> bool isLuma = ttype == TEXT_LUMA;
> bool usePsy = m_psyRdoqScale && isLuma && !useTransformSkip;
> int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; // Represents scaling through forward transform
> - int trSize = 1 << log2TrSize;
>
> X265_CHECK((cu.m_slice->m_sps->quadtreeTULog2MaxSize >= log2TrSize), "transform size too large\n");
> if (useTransformSkip)
> {
> #if X265_DEPTH <= 10
> - primitives.cpy16to16_shl(m_resiDctCoeff, residual, stride, transformShift, trSize);
> + X265_CHECK(transformShift >= 0, "invalid transformShift\n");
> + primitives.cpy2Dto1D_shl[sizeIdx](m_resiDctCoeff, residual, resiStride, transformShift);
> #else
> if (transformShift >= 0)
> - primitives.cvt16to32_shl(m_resiDctCoeff, residual, stride, transformShift, trSize);
> + primitives.cpy2Dto1D_shl[sizeIdx](m_resiDctCoeff, residual, resiStride, transformShift);
> else
> - {
> - int shift = -transformShift;
> - int offset = (1 << (shift - 1));
> - primitives.cvt16to32_shr[log2TrSize - 2](m_resiDctCoeff, residual, stride, shift, offset);
> - }
> + primitives.cpy2Dto1D_shr[sizeIdx](m_resiDctCoeff, residual, resiStride, -transformShift);
> #endif
> }
> else
> {
> bool isIntra = cu.isIntra(absPartIdx);
> - const uint32_t sizeIdx = log2TrSize - 2;
> int useDST = !sizeIdx && isLuma && isIntra;
> int index = DCT_4x4 + sizeIdx - useDST;
>
> - primitives.dct[index](residual, m_resiDctCoeff, stride);
> + primitives.dct[index](residual, m_resiDctCoeff, resiStride);
>
> /* NOTE: if RDOQ is disabled globally, psy-rdoq is also disabled, so
> * there is no risk of performing this DCT unnecessarily */
> if (usePsy)
> {
> + int trSize = 1 << log2TrSize;
> /* perform DCT on source pixels for psy-rdoq */
> primitives.square_copy_ps[sizeIdx](m_fencShortBuf, trSize, fenc, fencStride);
> primitives.dct[index](m_fencShortBuf, m_fencDctCoeff, trSize);
> @@ -408,12 +405,13 @@
> }
> }
>
> -void Quant::invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t stride, const coeff_t* coeff,
> +void Quant::invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t resiStride, const coeff_t* coeff,
> uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig)
> {
> + const uint32_t sizeIdx = log2TrSize - 2;
> if (transQuantBypass)
> {
> - primitives.copy_shl[log2TrSize - 2](residual, coeff, stride, 0);
> + primitives.cpy1Dto2D_shl[sizeIdx](residual, coeff, resiStride, 0);
> return;
> }
>
> @@ -427,7 +425,7 @@
> if (m_scalingList->m_bEnabled)
> {
> int scalingListType = (bIntra ? 0 : 3) + ttype;
> - const int32_t* dequantCoef = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem];
> + const int32_t* dequantCoef = m_scalingList->m_dequantCoef[sizeIdx][scalingListType][rem];
> primitives.dequant_scaling(coeff, dequantCoef, m_resiDctCoeff, numCoeff, per, shift);
> }
> else
> @@ -438,20 +436,18 @@
>
> if (useTransformSkip)
> {
> - int trSize = 1 << log2TrSize;
> -
> #if X265_DEPTH <= 10
> - primitives.copy_shr(residual, m_resiDctCoeff, stride, transformShift, trSize);
> + X265_CHECK(transformShift > 0, "invalid transformShift\n");
> + primitives.cpy1Dto2D_shr[sizeIdx](residual, m_resiDctCoeff, resiStride, transformShift);
> #else
> if (transformShift > 0)
> - primitives.copy_shr(residual, m_resiDctCoeff, stride, transformShift, trSize);
> + primitives.cpy1Dto2D_shr[sizeIdx](residual, m_resiDctCoeff, resiStride, transformShift);
> else
> - primitives.cvt32to16_shl[log2TrSize - 2](residual, m_resiDctCoeff, stride, -transformShift);
> + primitives.cpy1Dto2D_shl[sizeIdx](residual, m_resiDctCoeff, resiStride, -transformShift);
> #endif
> }
> else
> {
> - const uint32_t sizeIdx = log2TrSize - 2;
> int useDST = !sizeIdx && ttype == TEXT_LUMA && bIntra;
>
> X265_CHECK((int)numSig == primitives.count_nonzero(coeff, 1 << (log2TrSize * 2)), "numSig differ\n");
> @@ -459,17 +455,17 @@
> // DC only
> if (numSig == 1 && coeff[0] != 0 && !useDST)
> {
> - const int shift_1st = 7;
> + const int shift_1st = 7 - 6;
> const int add_1st = 1 << (shift_1st - 1);
> - const int shift_2nd = 12 - (X265_DEPTH - 8);
> + const int shift_2nd = 12 - (X265_DEPTH - 8) - 3;
> const int add_2nd = 1 << (shift_2nd - 1);
>
> - int dc_val = (((m_resiDctCoeff[0] * 64 + add_1st) >> shift_1st) * 64 + add_2nd) >> shift_2nd;
> - primitives.blockfill_s[sizeIdx](residual, stride, (int16_t)dc_val);
> + int dc_val = (((m_resiDctCoeff[0] * (64 >> 6) + add_1st) >> shift_1st) * (64 >> 3) + add_2nd) >> shift_2nd;
> + primitives.blockfill_s[sizeIdx](residual, resiStride, (int16_t)dc_val);
> return;
> }
>
> - primitives.idct[IDCT_4x4 + sizeIdx - useDST](m_resiDctCoeff, residual, stride);
> + primitives.idct[IDCT_4x4 + sizeIdx - useDST](m_resiDctCoeff, residual, resiStride);
> }
> }
>
> diff -r dfe0803ae6be -r b4454aa1b6ab source/common/quant.h
> --- a/source/common/quant.h Wed Nov 26 16:56:00 2014 -0600
> +++ b/source/common/quant.h Thu Nov 27 10:12:03 2014 +0900
> @@ -104,10 +104,10 @@
> /* CU setup */
> void setQPforQuant(const CUData& ctu);
>
> - uint32_t transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencstride, const int16_t* residual, uint32_t stride, coeff_t* coeff,
> + uint32_t transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t resiStride, coeff_t* coeff,
> uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip);
>
> - void invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t stride, const coeff_t* coeff,
> + void invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t resiStride, const coeff_t* coeff,
> uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig);
>
> /* static methods shared with entropy.cpp */
> diff -r dfe0803ae6be -r b4454aa1b6ab source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Wed Nov 26 16:56:00 2014 -0600
> +++ b/source/common/x86/asm-primitives.cpp Thu Nov 27 10:12:03 2014 +0900
> @@ -1336,10 +1336,22 @@
> p.sad_x4[LUMA_16x4] = x265_pixel_sad_x4_16x4_sse2;
> p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_mmx2;
>
> - p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_sse2;
> - p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_sse2;
> - p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_sse2;
> - p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_sse2;
> + p.cpy2Dto1D_shl[BLOCK_4x4] = x265_cpy2Dto1D_shl_4_sse2;
> + p.cpy2Dto1D_shl[BLOCK_8x8] = x265_cpy2Dto1D_shl_8_sse2;
> + p.cpy2Dto1D_shl[BLOCK_16x16] = x265_cpy2Dto1D_shl_16_sse2;
> + p.cpy2Dto1D_shl[BLOCK_32x32] = x265_cpy2Dto1D_shl_32_sse2;
> + p.cpy2Dto1D_shr[BLOCK_4x4] = x265_cpy2Dto1D_shr_4_sse2;
> + p.cpy2Dto1D_shr[BLOCK_8x8] = x265_cpy2Dto1D_shr_8_sse2;
> + p.cpy2Dto1D_shr[BLOCK_16x16] = x265_cpy2Dto1D_shr_16_sse2;
> + p.cpy2Dto1D_shr[BLOCK_32x32] = x265_cpy2Dto1D_shr_32_sse2;
> + p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_sse2;
> + p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_sse2;
> + p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_sse2;
> + p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_sse2;
> + p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_sse2;
> + p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_sse2;
> + p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_sse2;
> + p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_sse2;
>
> CHROMA_PIXELSUB_PS(_sse2);
> CHROMA_PIXELSUB_PS_422(_sse2);
> @@ -1406,10 +1418,6 @@
> p.quant = x265_quant_sse4;
> p.nquant = x265_nquant_sse4;
> p.dequant_normal = x265_dequant_normal_sse4;
> - p.cvt16to32_shr[BLOCK_4x4] = x265_cvt16to32_shr_4_sse4;
> - p.cvt16to32_shr[BLOCK_8x8] = x265_cvt16to32_shr_8_sse4;
> - p.cvt16to32_shr[BLOCK_16x16] = x265_cvt16to32_shr_16_sse4;
> - p.cvt16to32_shr[BLOCK_32x32] = x265_cvt16to32_shr_32_sse4;
> p.intra_pred[0][BLOCK_4x4] = x265_intra_pred_planar4_sse4;
> p.intra_pred[0][BLOCK_8x8] = x265_intra_pred_planar8_sse4;
> p.intra_pred[0][BLOCK_16x16] = x265_intra_pred_planar16_sse4;
> @@ -1438,6 +1446,14 @@
> p.nquant = x265_nquant_avx2;
> p.dequant_normal = x265_dequant_normal_avx2;
> p.scale1D_128to64 = x265_scale1D_128to64_avx2;
> + p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_avx2;
> + p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_avx2;
> + p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_avx2;
> + p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_avx2;
> + p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_avx2;
> + p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_avx2;
> + p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_avx2;
> + p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_avx2;
> #if X86_64
> p.dct[DCT_8x8] = x265_dct8_avx2;
> p.dct[DCT_16x16] = x265_dct16_avx2;
> @@ -1548,11 +1564,23 @@
> p.frame_init_lowres_core = x265_frame_init_lowres_core_sse2;
> SA8D_INTER_FROM_BLOCK(sse2);
>
> - p.cpy16to16_shl = x265_copy16to16_shl_sse2;
> - p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_sse2;
> - p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_sse2;
> - p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_sse2;
> - p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_sse2;
> + p.cpy2Dto1D_shl[BLOCK_4x4] = x265_cpy2Dto1D_shl_4_sse2;
> + p.cpy2Dto1D_shl[BLOCK_8x8] = x265_cpy2Dto1D_shl_8_sse2;
> + p.cpy2Dto1D_shl[BLOCK_16x16] = x265_cpy2Dto1D_shl_16_sse2;
> + p.cpy2Dto1D_shl[BLOCK_32x32] = x265_cpy2Dto1D_shl_32_sse2;
> + p.cpy2Dto1D_shr[BLOCK_4x4] = x265_cpy2Dto1D_shr_4_sse2;
> + p.cpy2Dto1D_shr[BLOCK_8x8] = x265_cpy2Dto1D_shr_8_sse2;
> + p.cpy2Dto1D_shr[BLOCK_16x16] = x265_cpy2Dto1D_shr_16_sse2;
> + p.cpy2Dto1D_shr[BLOCK_32x32] = x265_cpy2Dto1D_shr_32_sse2;
> + p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_sse2;
> + p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_sse2;
> + p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_sse2;
> + p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_sse2;
> + p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_sse2;
> + p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_sse2;
> + p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_sse2;
> + p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_sse2;
> +
> p.calcresidual[BLOCK_4x4] = x265_getResidual4_sse2;
> p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2;
> p.transpose[BLOCK_4x4] = x265_transpose4_sse2;
> @@ -1568,10 +1596,6 @@
> p.idct[IDST_4x4] = x265_idst4_sse2;
>
> p.planecopy_sp = x265_downShift_16_sse2;
> - p.copy_shl[BLOCK_4x4] = x265_copy_shl_4_sse2;
> - p.copy_shl[BLOCK_8x8] = x265_copy_shl_8_sse2;
> - p.copy_shl[BLOCK_16x16] = x265_copy_shl_16_sse2;
> - p.copy_shl[BLOCK_32x32] = x265_copy_shl_32_sse2;
> }
> if (cpuMask & X265_CPU_SSSE3)
> {
> @@ -1615,10 +1639,6 @@
> LUMA_ADDAVG(_sse4);
> CHROMA_ADDAVG(_sse4);
> CHROMA_ADDAVG_422(_sse4);
> - p.cvt16to32_shr[BLOCK_4x4] = x265_cvt16to32_shr_4_sse4;
> - p.cvt16to32_shr[BLOCK_8x8] = x265_cvt16to32_shr_8_sse4;
> - p.cvt16to32_shr[BLOCK_16x16] = x265_cvt16to32_shr_16_sse4;
> - p.cvt16to32_shr[BLOCK_32x32] = x265_cvt16to32_shr_32_sse4;
>
> // TODO: check POPCNT flag!
> p.copy_cnt[BLOCK_4x4] = x265_copy_cnt_4_sse4;
> @@ -1688,7 +1708,6 @@
> INTRA_ANG_SSE4(sse4);
>
> p.dct[DCT_8x8] = x265_dct8_sse4;
> - p.copy_shr = x265_copy_shr_sse4;
> // p.denoiseDct = x265_denoise_dct_sse4;
> }
> if (cpuMask & X265_CPU_AVX)
> @@ -1759,10 +1778,14 @@
> p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_avx2;
> p.blockfill_s[BLOCK_32x32] = x265_blockfill_s_32x32_avx2;
>
> - p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_avx2;
> - p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_avx2;
> - p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_avx2;
> - p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_avx2;
> + p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_avx2;
> + p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_avx2;
> + p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_avx2;
> + p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_avx2;
> + p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_avx2;
> + p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_avx2;
> + p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_avx2;
> + p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_avx2;
>
> // p.denoiseDct = x265_denoise_dct_avx2;
> p.dct[DCT_4x4] = x265_dct4_avx2;
> diff -r dfe0803ae6be -r b4454aa1b6ab source/common/x86/blockcopy8.asm
> --- a/source/common/x86/blockcopy8.asm Wed Nov 26 16:56:00 2014 -0600
> +++ b/source/common/x86/blockcopy8.asm Thu Nov 27 10:12:03 2014 +0900
> @@ -41,7 +41,7 @@
> SECTION .text
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_2x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_2x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse2
> cglobal blockcopy_pp_2x4, 4, 7, 0
> @@ -59,7 +59,7 @@
> RET
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_2x8(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_2x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse2
> cglobal blockcopy_pp_2x8, 4, 7, 0
> @@ -97,7 +97,7 @@
> RET
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_2x16(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_2x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse2
> cglobal blockcopy_pp_2x16, 4, 7, 0
> @@ -115,7 +115,7 @@
>
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_4x2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_4x2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse2
> cglobal blockcopy_pp_4x2, 4, 6, 0
> @@ -127,7 +127,7 @@
> RET
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_4x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_4x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse2
> cglobal blockcopy_pp_4x4, 4, 4, 4
> @@ -145,7 +145,7 @@
> RET
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_PP_W4_H8 2
> INIT_XMM sse2
> @@ -192,7 +192,7 @@
> BLOCKCOPY_PP_W4_H8 4, 32
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_6x8(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_6x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse2
> cglobal blockcopy_pp_6x8, 4, 7, 8
> @@ -257,7 +257,7 @@
> RET
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_6x16(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_6x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse2
> cglobal blockcopy_pp_6x16, 4, 7, 2
> @@ -279,7 +279,7 @@
>
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_8x2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_8x2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse2
> cglobal blockcopy_pp_8x2, 4, 4, 2
> @@ -291,7 +291,7 @@
> RET
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_8x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_8x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse2
> cglobal blockcopy_pp_8x4, 4, 4, 4
> @@ -309,7 +309,7 @@
> RET
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_8x6(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_8x6(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse2
> cglobal blockcopy_pp_8x6, 4, 7, 6
> @@ -333,7 +333,7 @@
> RET
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_8x12(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_8x12(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse2
> cglobal blockcopy_pp_8x12, 4, 5, 2
> @@ -350,7 +350,7 @@
> RET
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_PP_W8_H8 2
> INIT_XMM sse2
> @@ -397,7 +397,7 @@
> BLOCKCOPY_PP_W8_H8 8, 64
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_PP_W12_H4 2
> INIT_XMM sse2
> @@ -439,7 +439,7 @@
> BLOCKCOPY_PP_W12_H4 12, 32
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_16x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_16x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_PP_W16_H4 2
> INIT_XMM sse2
> @@ -471,7 +471,7 @@
> BLOCKCOPY_PP_W16_H4 16, 12
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_PP_W16_H8 2
> INIT_XMM sse2
> @@ -519,7 +519,7 @@
> BLOCKCOPY_PP_W16_H8 16, 24
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_PP_W24_H4 2
> INIT_XMM sse2
> @@ -560,7 +560,7 @@
> BLOCKCOPY_PP_W24_H4 24, 64
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_PP_W32_H4 2
> INIT_XMM sse2
> @@ -684,7 +684,7 @@
> RET
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_32x24(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_32x24(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> INIT_YMM avx
> cglobal blockcopy_pp_32x24, 4, 7, 6
> @@ -722,7 +722,7 @@
> RET
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_PP_W32_H16_avx 2
> INIT_YMM avx
> @@ -788,7 +788,7 @@
> BLOCKCOPY_PP_W32_H16_avx 32, 64
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_PP_W48_H2 2
> INIT_XMM sse2
> @@ -836,7 +836,7 @@
> BLOCKCOPY_PP_W48_H2 48, 64
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
> +; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_PP_W64_H4 2
> INIT_XMM sse2
> @@ -897,7 +897,7 @@
> BLOCKCOPY_PP_W64_H4 64, 64
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_2x4(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_2x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse4
> cglobal blockcopy_sp_2x4, 4, 5, 2
> @@ -926,7 +926,7 @@
>
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_2x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_2x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse4
> cglobal blockcopy_sp_2x8, 4, 5, 2
> @@ -974,11 +974,11 @@
> RET
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_SP_W2_H2 2
> INIT_XMM sse2
> -cglobal blockcopy_sp_%1x%2, 4, 7, 2, dest, destStride, src, srcStride
> +cglobal blockcopy_sp_%1x%2, 4, 7, 2, dst, dstStride, src, srcStride
> add r3, r3
> mov r6d, %2/2
> .loop:
> @@ -1003,10 +1003,10 @@
> BLOCKCOPY_SP_W2_H2 2, 16
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_4x2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_4x2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse2
> -cglobal blockcopy_sp_4x2, 4, 4, 2, dest, destStride, src, srcStride
> +cglobal blockcopy_sp_4x2, 4, 4, 2, dst, dstStride, src, srcStride
>
> add r3, r3
>
> @@ -1022,10 +1022,10 @@
> RET
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_4x4(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_4x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse2
> -cglobal blockcopy_sp_4x4, 4, 4, 4, dest, destStride, src, srcStride
> +cglobal blockcopy_sp_4x4, 4, 4, 4, dst, dstStride, src, srcStride
>
> add r3, r3
>
> @@ -1049,10 +1049,10 @@
> RET
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_4x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_4x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse2
> -cglobal blockcopy_sp_4x8, 4, 4, 8, dest, destStride, src, srcStride
> +cglobal blockcopy_sp_4x8, 4, 4, 8, dst, dstStride, src, srcStride
>
> add r3, r3
>
> @@ -1092,11 +1092,11 @@
> RET
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_SP_W4_H8 2
> INIT_XMM sse2
> -cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
> +cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
>
> mov r4d, %2/8
>
> @@ -1150,7 +1150,7 @@
> BLOCKCOPY_SP_W4_H8 4, 32
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_6x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_6x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse4
> cglobal blockcopy_sp_6x8, 4, 4, 2
> @@ -1213,11 +1213,11 @@
> RET
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_SP_W6_H2 2
> INIT_XMM sse2
> -cglobal blockcopy_sp_%1x%2, 4, 7, 4, dest, destStride, src, srcStride
> +cglobal blockcopy_sp_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride
> add r3, r3
> mov r6d, %2/2
> .loop:
> @@ -1247,10 +1247,10 @@
> BLOCKCOPY_SP_W6_H2 6, 16
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_8x2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_8x2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse2
> -cglobal blockcopy_sp_8x2, 4, 4, 2, dest, destStride, src, srcStride
> +cglobal blockcopy_sp_8x2, 4, 4, 2, dst, dstStride, src, srcStride
>
> add r3, r3
>
> @@ -1265,10 +1265,10 @@
> RET
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_8x4(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_8x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse2
> -cglobal blockcopy_sp_8x4, 4, 4, 4, dest, destStride, src, srcStride
> +cglobal blockcopy_sp_8x4, 4, 4, 4, dst, dstStride, src, srcStride
>
> add r3, r3
>
> @@ -1290,10 +1290,10 @@
> RET
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_8x6(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_8x6(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse2
> -cglobal blockcopy_sp_8x6, 4, 4, 6, dest, destStride, src, srcStride
> +cglobal blockcopy_sp_8x6, 4, 4, 6, dst, dstStride, src, srcStride
>
> add r3, r3
>
> @@ -1322,10 +1322,10 @@
> RET
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_8x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_8x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse2
> -cglobal blockcopy_sp_8x8, 4, 4, 8, dest, destStride, src, srcStride
> +cglobal blockcopy_sp_8x8, 4, 4, 8, dst, dstStride, src, srcStride
>
> add r3, r3
>
> @@ -1361,11 +1361,11 @@
> RET
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_SP_W8_H4 2
> INIT_XMM sse2
> -cglobal blockcopy_sp_%1x%2, 4, 5, 4, dest, destStride, src, srcStride
> +cglobal blockcopy_sp_%1x%2, 4, 5, 4, dst, dstStride, src, srcStride
> add r3, r3
> mov r4d, %2/4
> .loop:
> @@ -1391,11 +1391,11 @@
> BLOCKCOPY_SP_W8_H4 8, 12
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_SP_W8_H8 2
> INIT_XMM sse2
> -cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
> +cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
>
> mov r4d, %2/8
>
> @@ -1446,11 +1446,11 @@
> BLOCKCOPY_SP_W8_H8 8, 64
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_SP_W12_H4 2
> INIT_XMM sse2
> -cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
> +cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
>
> mov r4d, %2/4
>
> @@ -1503,11 +1503,11 @@
> BLOCKCOPY_SP_W12_H4 12, 32
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_SP_W16_H4 2
> INIT_XMM sse2
> -cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
> +cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
>
> mov r4d, %2/4
>
> @@ -1554,11 +1554,11 @@
> BLOCKCOPY_SP_W16_H4 16, 24
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_SP_W24_H2 2
> INIT_XMM sse2
> -cglobal blockcopy_sp_%1x%2, 4, 5, 6, dest, destStride, src, srcStride
> +cglobal blockcopy_sp_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride
>
> mov r4d, %2/2
>
> @@ -1595,11 +1595,11 @@
> BLOCKCOPY_SP_W24_H2 24, 64
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_SP_W32_H2 2
> INIT_XMM sse2
> -cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
> +cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
>
> mov r4d, %2/2
>
> @@ -1643,11 +1643,11 @@
> BLOCKCOPY_SP_W32_H2 32, 48
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_SP_W48_H2 2
> INIT_XMM sse2
> -cglobal blockcopy_sp_%1x%2, 4, 5, 6, dest, destStride, src, srcStride
> +cglobal blockcopy_sp_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride
>
> mov r4d, %2
>
> @@ -1681,11 +1681,11 @@
> BLOCKCOPY_SP_W48_H2 48, 64
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
> +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_SP_W64_H1 2
> INIT_XMM sse2
> -cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
> +cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
>
> mov r4d, %2
>
> @@ -1726,10 +1726,10 @@
> BLOCKCOPY_SP_W64_H1 64, 64
>
> ;-----------------------------------------------------------------------------
> -; void blockfill_s_4x4(int16_t *dest, intptr_t destride, int16_t val)
> +; void blockfill_s_4x4(int16_t* dst, intptr_t dstStride, int16_t val)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse2
> -cglobal blockfill_s_4x4, 3, 3, 1, dest, destStride, val
> +cglobal blockfill_s_4x4, 3, 3, 1, dst, dstStride, val
>
> add r1, r1
>
> @@ -1745,10 +1745,10 @@
> RET
>
> ;-----------------------------------------------------------------------------
> -; void blockfill_s_8x8(int16_t *dest, intptr_t destride, int16_t val)
> +; void blockfill_s_8x8(int16_t* dst, intptr_t dstStride, int16_t val)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse2
> -cglobal blockfill_s_8x8, 3, 3, 1, dest, destStride, val
> +cglobal blockfill_s_8x8, 3, 3, 1, dst, dstStride, val
>
> add r1, r1
>
> @@ -1774,11 +1774,11 @@
> RET
>
> ;-----------------------------------------------------------------------------
> -; void blockfill_s_%1x%2(int16_t *dest, intptr_t destride, int16_t val)
> +; void blockfill_s_%1x%2(int16_t* dst, intptr_t dstStride, int16_t val)
> ;-----------------------------------------------------------------------------
> %macro BLOCKFILL_S_W16_H8 2
> INIT_XMM sse2
> -cglobal blockfill_s_%1x%2, 3, 5, 1, dest, destStride, val
> +cglobal blockfill_s_%1x%2, 3, 5, 1, dst, dstStride, val
>
> mov r3d, %2/8
>
> @@ -1855,11 +1855,11 @@
> RET
>
> ;-----------------------------------------------------------------------------
> -; void blockfill_s_%1x%2(int16_t *dest, intptr_t destride, int16_t val)
> +; void blockfill_s_%1x%2(int16_t* dst, intptr_t dstStride, int16_t val)
> ;-----------------------------------------------------------------------------
> %macro BLOCKFILL_S_W32_H4 2
> INIT_XMM sse2
> -cglobal blockfill_s_%1x%2, 3, 5, 1, dest, destStride, val
> +cglobal blockfill_s_%1x%2, 3, 5, 1, dst, dstStride, val
>
> mov r3d, %2/4
>
> @@ -1983,10 +1983,10 @@
> RET
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ps_2x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +; void blockcopy_ps_2x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
> ;-----------------------------------------------------------------------------
> INIT_XMM sse4
> -cglobal blockcopy_ps_2x4, 4, 4, 1, dest, destStride, src, srcStride
> +cglobal blockcopy_ps_2x4, 4, 4, 1, dst, dstStride, src, srcStride
>
> add r1, r1
>
> @@ -2013,10 +2013,10 @@
>
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ps_2x8(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +; void blockcopy_ps_2x8(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
> ;-----------------------------------------------------------------------------
> INIT_XMM sse4
> -cglobal blockcopy_ps_2x8, 4, 4, 1, dest, destStride, src, srcStride
> +cglobal blockcopy_ps_2x8, 4, 4, 1, dst, dstStride, src, srcStride
>
> add r1, r1
>
> @@ -2065,10 +2065,10 @@
>
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ps_2x16(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +; void blockcopy_ps_2x16(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
> ;-----------------------------------------------------------------------------
> INIT_XMM sse4
> -cglobal blockcopy_ps_2x16, 4, 5, 2, dest, destStride, src, srcStride
> +cglobal blockcopy_ps_2x16, 4, 5, 2, dst, dstStride, src, srcStride
> add r1, r1
> mov r4d, 16/2
> .loop:
> @@ -2086,10 +2086,10 @@
>
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ps_4x2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +; void blockcopy_ps_4x2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
> ;-----------------------------------------------------------------------------
> INIT_XMM sse4
> -cglobal blockcopy_ps_4x2, 4, 4, 1, dest, destStride, src, srcStride
> +cglobal blockcopy_ps_4x2, 4, 4, 1, dst, dstStride, src, srcStride
>
> add r1, r1
>
> @@ -2105,10 +2105,10 @@
>
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ps_4x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +; void blockcopy_ps_4x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
> ;-----------------------------------------------------------------------------
> INIT_XMM sse4
> -cglobal blockcopy_ps_4x4, 4, 4, 1, dest, destStride, src, srcStride
> +cglobal blockcopy_ps_4x4, 4, 4, 1, dst, dstStride, src, srcStride
>
> add r1, r1
>
> @@ -2135,11 +2135,11 @@
>
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_PS_W4_H4 2
> INIT_XMM sse4
> -cglobal blockcopy_ps_%1x%2, 4, 5, 1, dest, destStride, src, srcStride
> +cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride
>
> add r1, r1
> mov r4d, %2/4
> @@ -2180,11 +2180,11 @@
>
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_PS_W6_H4 2
> INIT_XMM sse4
> -cglobal blockcopy_ps_%1x%2, 4, 5, 1, dest, destStride, src, srcStride
> +cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride
>
> add r1, r1
> mov r4d, %2/4
> @@ -2227,10 +2227,10 @@
> BLOCKCOPY_PS_W6_H4 6, 16
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ps_8x2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +; void blockcopy_ps_8x2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
> ;-----------------------------------------------------------------------------
> INIT_XMM sse4
> -cglobal blockcopy_ps_8x2, 4, 4, 1, dest, destStride, src, srcStride
> +cglobal blockcopy_ps_8x2, 4, 4, 1, dst, dstStride, src, srcStride
>
> add r1, r1
>
> @@ -2245,10 +2245,10 @@
> RET
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ps_8x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +; void blockcopy_ps_8x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
> ;-----------------------------------------------------------------------------
> INIT_XMM sse4
> -cglobal blockcopy_ps_8x4, 4, 4, 1, dest, destStride, src, srcStride
> +cglobal blockcopy_ps_8x4, 4, 4, 1, dst, dstStride, src, srcStride
>
> add r1, r1
>
> @@ -2274,10 +2274,10 @@
> RET
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ps_8x6(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +; void blockcopy_ps_8x6(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
> ;-----------------------------------------------------------------------------
> INIT_XMM sse4
> -cglobal blockcopy_ps_8x6, 4, 4, 1, dest, destStride, src, srcStride
> +cglobal blockcopy_ps_8x6, 4, 4, 1, dst, dstStride, src, srcStride
>
> add r1, r1
>
> @@ -2314,11 +2314,11 @@
> RET
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_PS_W8_H4 2
> INIT_XMM sse4
> -cglobal blockcopy_ps_%1x%2, 4, 5, 1, dest, destStride, src, srcStride
> +cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride
>
> add r1, r1
> mov r4d, %2/4
> @@ -2361,11 +2361,11 @@
>
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_PS_W12_H2 2
> INIT_XMM sse4
> -cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
> +cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
>
> add r1, r1
> mov r4d, %2/2
> @@ -2398,10 +2398,10 @@
> BLOCKCOPY_PS_W12_H2 12, 32
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ps_16x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +; void blockcopy_ps_16x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
> ;-----------------------------------------------------------------------------
> INIT_XMM sse4
> -cglobal blockcopy_ps_16x4, 4, 4, 3, dest, destStride, src, srcStride
> +cglobal blockcopy_ps_16x4, 4, 4, 3, dst, dstStride, src, srcStride
>
> add r1, r1
> pxor m0, m0
> @@ -2436,11 +2436,11 @@
> RET
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_PS_W16_H4 2
> INIT_XMM sse4
> -cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
> +cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
>
> add r1, r1
> mov r4d, %2/4
> @@ -2492,11 +2492,11 @@
> BLOCKCOPY_PS_W16_H4 16, 24
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_PS_W24_H2 2
> INIT_XMM sse4
> -cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
> +cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
>
> add r1, r1
> mov r4d, %2/2
> @@ -2537,11 +2537,11 @@
> BLOCKCOPY_PS_W24_H2 24, 64
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_PS_W32_H2 2
> INIT_XMM sse4
> -cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
> +cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
>
> add r1, r1
> mov r4d, %2/2
> @@ -2590,11 +2590,11 @@
> BLOCKCOPY_PS_W32_H2 32, 48
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_PS_W48_H2 2
> INIT_XMM sse4
> -cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
> +cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
>
> add r1, r1
> mov r4d, %2/2
> @@ -2649,11 +2649,11 @@
> BLOCKCOPY_PS_W48_H2 48, 64
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
> +; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_PS_W64_H2 2
> INIT_XMM sse4
> -cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
> +cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
>
> add r1, r1
> mov r4d, %2/2
> @@ -2723,7 +2723,7 @@
> BLOCKCOPY_PS_W64_H2 64, 64
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_2x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_2x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse2
> cglobal blockcopy_ss_2x4, 4, 6, 0
> @@ -2746,7 +2746,7 @@
> RET
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_2x8(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_2x8(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse2
> cglobal blockcopy_ss_2x8, 4, 6, 0
> @@ -2785,7 +2785,7 @@
> RET
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_2x16(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_2x16(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse2
> cglobal blockcopy_ss_2x16, 4, 7, 0
> @@ -2805,7 +2805,7 @@
>
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_4x2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_4x2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse2
> cglobal blockcopy_ss_4x2, 4, 4, 2
> @@ -2821,7 +2821,7 @@
> RET
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_4x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_4x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse2
> cglobal blockcopy_ss_4x4, 4, 4, 4
> @@ -2841,7 +2841,7 @@
> RET
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_SS_W4_H8 2
> INIT_XMM sse2
> @@ -2889,7 +2889,7 @@
> BLOCKCOPY_SS_W4_H8 4, 32
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_6x8(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_6x8(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse2
> cglobal blockcopy_ss_6x8, 4, 4, 4
> @@ -2944,7 +2944,7 @@
> RET
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_6x16(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_6x16(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse2
> cglobal blockcopy_ss_6x16, 4, 5, 4
> @@ -2968,7 +2968,7 @@
>
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_8x2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_8x2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse2
> cglobal blockcopy_ss_8x2, 4, 4, 2
> @@ -2984,7 +2984,7 @@
> RET
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_8x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_8x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse2
> cglobal blockcopy_ss_8x4, 4, 4, 4
> @@ -3005,7 +3005,7 @@
> RET
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_8x6(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_8x6(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse2
> cglobal blockcopy_ss_8x6, 4, 4, 4
> @@ -3034,7 +3034,7 @@
> RET
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_8x12(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_8x12(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse2
> cglobal blockcopy_ss_8x12, 4, 5, 2
> @@ -3054,7 +3054,7 @@
>
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_SS_W8_H8 2
> INIT_XMM sse2
> @@ -3105,7 +3105,7 @@
> BLOCKCOPY_SS_W8_H8 8, 64
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_SS_W12_H4 2
> INIT_XMM sse2
> @@ -3149,7 +3149,7 @@
> BLOCKCOPY_SS_W12_H4 12, 32
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_16x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_16x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_SS_W16_H4 2
> INIT_XMM sse2
> @@ -3192,7 +3192,7 @@
> BLOCKCOPY_SS_W16_H4 16, 12
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_SS_W16_H4_avx 2
> INIT_YMM avx
> @@ -3229,7 +3229,7 @@
> BLOCKCOPY_SS_W16_H4_avx 16, 64
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_SS_W16_H8 2
> INIT_XMM sse2
> @@ -3302,7 +3302,7 @@
> BLOCKCOPY_SS_W16_H8 16, 24
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_SS_W24_H4 2
> INIT_XMM sse2
> @@ -3354,7 +3354,7 @@
> BLOCKCOPY_SS_W24_H4 24, 64
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_SS_W32_H4 2
> INIT_XMM sse2
> @@ -3422,7 +3422,7 @@
> BLOCKCOPY_SS_W32_H4 32, 48
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_SS_W48_H2 2
> INIT_XMM sse2
> @@ -3500,11 +3500,11 @@
> BLOCKCOPY_SS_W48_H2 48, 64
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_SS_W64_H4 2
> INIT_XMM sse2
> -cglobal blockcopy_ss_%1x%2, 4, 5, 6, dest, deststride, src, srcstride
> +cglobal blockcopy_ss_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride
> mov r4d, %2/4
> add r1, r1
> add r3, r3
> @@ -3606,11 +3606,11 @@
> BLOCKCOPY_SS_W64_H4 64, 64
>
> ;-----------------------------------------------------------------------------
> -; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
> +; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
> ;-----------------------------------------------------------------------------
> %macro BLOCKCOPY_SS_W64_H4_avx 2
> INIT_YMM avx
> -cglobal blockcopy_ss_%1x%2, 4, 7, 4, dest, deststride, src, srcstride
> +cglobal blockcopy_ss_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride
> mov r4d, %2/4
> add r1, r1
> add r3, r3
> @@ -3670,152 +3670,82 @@
> BLOCKCOPY_SS_W64_H4_avx 64, 64
>
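A quick note for anyone skimming the renames above: the suffix encodes the
element types being copied (sp = int16_t source to pixel destination, ps =
pixel to int16_t, ss = int16_t to int16_t), and the operand names now match
the C prototypes. Roughly what the sp flavour computes, as a sketch with an
explicit block size (the asm kernels are specialized per WxH, and the pixel
typedef is the usual 8-bit-build assumption):

    #include <stdint.h>

    typedef uint8_t pixel;   /* uint16_t in HIGH_BIT_DEPTH builds */

    /* sketch only: copy a bx-by-by block of int16_t values (assumed to be
     * already clipped to pixel range) into a pixel buffer, honoring both
     * strides */
    static void blockcopy_sp_c(pixel* dst, intptr_t dstStride,
                               const int16_t* src, intptr_t srcStride,
                               int bx, int by)
    {
        for (int y = 0; y < by; y++)
        {
            for (int x = 0; x < bx; x++)
                dst[x] = (pixel)src[x];
            dst += dstStride;
            src += srcStride;
        }
    }
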
> ;--------------------------------------------------------------------------------------
> -; void copy16to16_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size);
> +; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> ;--------------------------------------------------------------------------------------
> INIT_XMM sse2
> -cglobal copy16to16_shl, 5, 6, 2, dst, src, stride, shift, size
> -%define shift m1
> -
> - ; make shift
> - movd shift, r3d
> +cglobal cpy2Dto1D_shr_4, 3, 4, 4
> + add r2d, r2d
> + movd m0, r3m
> + pcmpeqw m1, m1
> + psllw m1, m0
> + psraw m1, 1
>
> ; register alloc
> ; r0 - dst
> ; r1 - src
> - ; r2 - stride
> - ; r4 - size
> -
> - sub r2d, r4d
> - add r2d, r2d
> - mov r5d, r4d
> - shr r4d, 2
> -.loop_row:
> - mov r3d, r4d
> -
> -.loop_col:
> - movh m0, [r1]
> - psllw m0, shift
> - movh [r0], m0
> -
> - add r1, 8
> - add r0, 8
> -
> - dec r3d
> - jnz .loop_col
> -
> - add r1, r2
> - dec r5d
> - jnz .loop_row
> + ; r2 - srcStride
> + ; m0 - shift
> + ; m1 - word [-round]
> +
> + ; Row 0-3
> + movh m2, [r1]
> + movhps m2, [r1 + r2]
> + lea r1, [r1 + r2 * 2]
> + movh m3, [r1]
> + movhps m3, [r1 + r2]
> + psubw m2, m1
> + psubw m3, m1
> + psraw m2, m0
> + psraw m3, m0
> + mova [r0 + 0 * mmsize], m2
> + mova [r0 + 1 * mmsize], m3
> RET
>
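The -round setup above (pcmpeqw/psllw/psraw) builds the per-word constant
-(1 << (shift - 1)), so the psubw effectively adds the rounding offset
before the arithmetic right shift. In C terms the whole cpy2Dto1D_shr
family boils down to this (a per-size sketch, assuming shift >= 1 as the
asm does; not the committed C code):

    #include <stdint.h>

    /* sketch: strided 2-D block -> contiguous 1-D buffer, rounding shift */
    static void cpy2Dto1D_shr_c(int16_t* dst, const int16_t* src,
                                intptr_t srcStride, int shift, int size)
    {
        const int16_t round = 1 << (shift - 1);
        for (int i = 0; i < size; i++)
        {
            for (int j = 0; j < size; j++)
                dst[j] = (int16_t)((src[j] + round) >> shift);
            dst += size;
            src += srcStride;
        }
    }
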
>
> ;--------------------------------------------------------------------------------------
> -; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
> +; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> ;--------------------------------------------------------------------------------------
> -INIT_XMM sse4
> -cglobal cvt16to32_shr_4, 3,3,3
> +INIT_XMM sse2
> +cglobal cpy2Dto1D_shr_8, 3, 5, 4
> add r2d, r2d
> movd m0, r3m
> - movd m1, r4m
> - pshufd m1, m1, 0
> + pcmpeqw m1, m1
> + psllw m1, m0
> + psraw m1, 1
> + mov r3d, 8/4
> + lea r4, [r2 * 3]
>
> ; register alloc
> ; r0 - dst
> ; r1 - src
> - ; r2 - stride
> - ; m0 - shift
> - ; m1 - dword [offset]
> -
> - ; Row 0
> - pmovsxwd m2, [r1]
> - paddd m2, m1
> - psrad m2, m0
> - movu [r0 + 0 * mmsize], m2
> -
> - ; Row 1
> - pmovsxwd m2, [r1 + r2]
> - paddd m2, m1
> - psrad m2, m0
> - movu [r0 + 1 * mmsize], m2
> -
> - ; Row 2
> - lea r1, [r1 + r2 * 2]
> - pmovsxwd m2, [r1]
> - paddd m2, m1
> - psrad m2, m0
> - movu [r0 + 2 * mmsize], m2
> -
> - ; Row 3
> - pmovsxwd m2, [r1 + r2]
> - paddd m2, m1
> - psrad m2, m0
> - movu [r0 + 3 * mmsize], m2
> - RET
> -
> -
> -;--------------------------------------------------------------------------------------
> -; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
> -;--------------------------------------------------------------------------------------
> -INIT_XMM sse4
> -cglobal cvt16to32_shr_8, 3,5,3
> - add r2d, r2d
> - movd m0, r3m
> - movd m1, r4m
> - pshufd m1, m1, 0
> - mov r3d, 8/4
> - lea r4, [r2 * 3]
> -
> - ; register alloc
> - ; r0 - dst
> - ; r1 - src
> - ; r2 - stride
> + ; r2 - srcStride
> ; r3 - loop counter
> ; r4 - stride * 3
> ; m0 - shift
> - ; m1 - dword [offset]
> + ; m1 - word [-round]
>
> .loop:
> - ; Row 0
> - pmovsxwd m2, [r1]
> - pmovsxwd m3, [r1 + mmsize/2]
> - paddd m2, m1
> - paddd m3, m1
> - psrad m2, m0
> - psrad m3, m0
> - movu [r0 + 0 * mmsize], m2
> - movu [r0 + 1 * mmsize], m3
> -
> - ; Row 1
> - pmovsxwd m2, [r1 + r2]
> - pmovsxwd m3, [r1 + r2 + mmsize/2]
> - paddd m2, m1
> - paddd m3, m1
> - psrad m2, m0
> - psrad m3, m0
> - movu [r0 + 2 * mmsize], m2
> - movu [r0 + 3 * mmsize], m3
> -
> - ; Row 2
> - pmovsxwd m2, [r1 + r2 * 2]
> - pmovsxwd m3, [r1 + r2 * 2 + mmsize/2]
> - paddd m2, m1
> - paddd m3, m1
> - psrad m2, m0
> - psrad m3, m0
> - movu [r0 + 4 * mmsize], m2
> - movu [r0 + 5 * mmsize], m3
> -
> - ; Row 3
> - pmovsxwd m2, [r1 + r4]
> - pmovsxwd m3, [r1 + r4 + mmsize/2]
> - paddd m2, m1
> - paddd m3, m1
> - psrad m2, m0
> - psrad m3, m0
> - movu [r0 + 6 * mmsize], m2
> - movu [r0 + 7 * mmsize], m3
> -
> - add r0, 8 * mmsize
> + ; Row 0-1
> + mova m2, [r1]
> + mova m3, [r1 + r2]
> + psubw m2, m1
> + psubw m3, m1
> + psraw m2, m0
> + psraw m3, m0
> + mova [r0 + 0 * mmsize], m2
> + mova [r0 + 1 * mmsize], m3
> +
> + ; Row 2-3
> + mova m2, [r1 + r2 * 2]
> + mova m3, [r1 + r4]
> + psubw m2, m1
> + psubw m3, m1
> + psraw m2, m0
> + psraw m3, m0
> + mova [r0 + 2 * mmsize], m2
> + mova [r0 + 3 * mmsize], m3
> +
> + add r0, 4 * mmsize
> lea r1, [r1 + r2 * 4]
> dec r3d
> jnz .loop
> @@ -3823,62 +3753,47 @@
>
>
> ;--------------------------------------------------------------------------------------
> -; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
> +; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> ;--------------------------------------------------------------------------------------
> -INIT_XMM sse4
> -cglobal cvt16to32_shr_16, 3,4,6
> +INIT_XMM sse2
> +cglobal cpy2Dto1D_shr_16, 3, 4, 4
> add r2d, r2d
> movd m0, r3m
> - movd m1, r4m
> - pshufd m1, m1, 0
> + pcmpeqw m1, m1
> + psllw m1, m0
> + psraw m1, 1
> mov r3d, 16/2
>
> ; register alloc
> ; r0 - dst
> ; r1 - src
> - ; r2 - stride
> + ; r2 - srcStride
> ; r3 - loop counter
> ; m0 - shift
> - ; m1 - dword [offset]
> + ; m1 - word [-round]
>
> .loop:
> ; Row 0
> - pmovsxwd m2, [r1 + 0 * mmsize/2]
> - pmovsxwd m3, [r1 + 1 * mmsize/2]
> - pmovsxwd m4, [r1 + 2 * mmsize/2]
> - pmovsxwd m5, [r1 + 3 * mmsize/2]
> - paddd m2, m1
> - paddd m3, m1
> - paddd m4, m1
> - paddd m5, m1
> - psrad m2, m0
> - psrad m3, m0
> - psrad m4, m0
> - psrad m5, m0
> - movu [r0 + 0 * mmsize], m2
> - movu [r0 + 1 * mmsize], m3
> - movu [r0 + 2 * mmsize], m4
> - movu [r0 + 3 * mmsize], m5
> + mova m2, [r1 + 0 * mmsize]
> + mova m3, [r1 + 1 * mmsize]
> + psubw m2, m1
> + psubw m3, m1
> + psraw m2, m0
> + psraw m3, m0
> + mova [r0 + 0 * mmsize], m2
> + mova [r0 + 1 * mmsize], m3
>
> ; Row 1
> - pmovsxwd m2, [r1 + r2 + 0 * mmsize/2]
> - pmovsxwd m3, [r1 + r2 +1 * mmsize/2]
> - pmovsxwd m4, [r1 + r2 +2 * mmsize/2]
> - pmovsxwd m5, [r1 + r2 +3 * mmsize/2]
> - paddd m2, m1
> - paddd m3, m1
> - paddd m4, m1
> - paddd m5, m1
> - psrad m2, m0
> - psrad m3, m0
> - psrad m4, m0
> - psrad m5, m0
> - movu [r0 + 4 * mmsize], m2
> - movu [r0 + 5 * mmsize], m3
> - movu [r0 + 6 * mmsize], m4
> - movu [r0 + 7 * mmsize], m5
> -
> - add r0, 8 * mmsize
> + mova m2, [r1 + r2 + 0 * mmsize]
> + mova m3, [r1 + r2 + 1 * mmsize]
> + psubw m2, m1
> + psubw m3, m1
> + psraw m2, m0
> + psraw m3, m0
> + mova [r0 + 2 * mmsize], m2
> + mova [r0 + 3 * mmsize], m3
> +
> + add r0, 4 * mmsize
> lea r1, [r1 + r2 * 2]
> dec r3d
> jnz .loop
> @@ -3886,61 +3801,45 @@
>
>
> ;--------------------------------------------------------------------------------------
> -; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
> +; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> ;--------------------------------------------------------------------------------------
> -INIT_XMM sse4
> -cglobal cvt16to32_shr_32, 3,4,6
> +INIT_XMM sse2
> +cglobal cpy2Dto1D_shr_32, 3, 4, 6
> add r2d, r2d
> movd m0, r3m
> - movd m1, r4m
> - pshufd m1, m1, 0
> + pcmpeqw m1, m1
> + psllw m1, m0
> + psraw m1, 1
> mov r3d, 32/1
>
> ; register alloc
> ; r0 - dst
> ; r1 - src
> - ; r2 - stride
> + ; r2 - srcStride
> ; r3 - loop counter
> ; m0 - shift
> - ; m1 - dword [offset]
> + ; m1 - word [-round]
>
> .loop:
> ; Row 0
> - pmovsxwd m2, [r1 + 0 * mmsize/2]
> - pmovsxwd m3, [r1 + 1 * mmsize/2]
> - pmovsxwd m4, [r1 + 2 * mmsize/2]
> - pmovsxwd m5, [r1 + 3 * mmsize/2]
> - paddd m2, m1
> - paddd m3, m1
> - paddd m4, m1
> - paddd m5, m1
> - psrad m2, m0
> - psrad m3, m0
> - psrad m4, m0
> - psrad m5, m0
> - movu [r0 + 0 * mmsize], m2
> - movu [r0 + 1 * mmsize], m3
> - movu [r0 + 2 * mmsize], m4
> - movu [r0 + 3 * mmsize], m5
> -
> - pmovsxwd m2, [r1 + 4 * mmsize/2]
> - pmovsxwd m3, [r1 + 5 * mmsize/2]
> - pmovsxwd m4, [r1 + 6 * mmsize/2]
> - pmovsxwd m5, [r1 + 7 * mmsize/2]
> - paddd m2, m1
> - paddd m3, m1
> - paddd m4, m1
> - paddd m5, m1
> - psrad m2, m0
> - psrad m3, m0
> - psrad m4, m0
> - psrad m5, m0
> - movu [r0 + 4 * mmsize], m2
> - movu [r0 + 5 * mmsize], m3
> - movu [r0 + 6 * mmsize], m4
> - movu [r0 + 7 * mmsize], m5
> -
> - add r0, 8 * mmsize
> + mova m2, [r1 + 0 * mmsize]
> + mova m3, [r1 + 1 * mmsize]
> + mova m4, [r1 + 2 * mmsize]
> + mova m5, [r1 + 3 * mmsize]
> + psubw m2, m1
> + psubw m3, m1
> + psubw m4, m1
> + psubw m5, m1
> + psraw m2, m0
> + psraw m3, m0
> + psraw m4, m0
> + psraw m5, m0
> + mova [r0 + 0 * mmsize], m2
> + mova [r0 + 1 * mmsize], m3
> + mova [r0 + 2 * mmsize], m4
> + mova [r0 + 3 * mmsize], m5
> +
> + add r0, 4 * mmsize
> add r1, r2
> dec r3d
> jnz .loop
> @@ -3948,58 +3847,239 @@
>
>
> ;--------------------------------------------------------------------------------------
> -; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
> +; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
> ;--------------------------------------------------------------------------------------
> INIT_XMM sse2
> -cglobal cvt32to16_shl_4, 3,3,5
> +cglobal cpy1Dto2D_shl_4, 3, 3, 3
> add r2d, r2d
> movd m0, r3m
>
> ; Row 0-3
> + mova m1, [r1 + 0 * mmsize]
> + mova m2, [r1 + 1 * mmsize]
> + psllw m1, m0
> + psllw m2, m0
> + movh [r0], m1
> + movhps [r0 + r2], m1
> + movh [r0 + r2 * 2], m2
> + lea r2, [r2 * 3]
> + movhps [r0 + r2], m2
> + RET
> +
> +
> +INIT_YMM avx2
> +cglobal cpy1Dto2D_shl_4, 3, 3, 2
> + add r2d, r2d
> + movd xm0, r3m
> +
> + ; Row 0-3
> + movu m1, [r1]
> + psllw m1, xm0
> + vextracti128 xm0, m1, 1
> + movq [r0], xm1
> + movhps [r0 + r2], xm1
> + lea r0, [r0 + r2 * 2]
> + movq [r0], xm0
> + movhps [r0 + r2], xm0
> + RET
> +
> +
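cpy1Dto2D_shl goes the other way: contiguous coefficients back into a
strided block, shifted left with no rounding, which is why these kernels
only need psllw. As a per-size sketch (again not the committed C):

    #include <stdint.h>

    /* sketch: contiguous 1-D buffer -> strided 2-D block, left shift */
    static void cpy1Dto2D_shl_c(int16_t* dst, const int16_t* src,
                                intptr_t dstStride, int shift, int size)
    {
        for (int i = 0; i < size; i++)
        {
            for (int j = 0; j < size; j++)
                dst[j] = (int16_t)(src[j] << shift);
            dst += dstStride;
            src += size;
        }
    }
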
> +;--------------------------------------------------------------------------------------
> +; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
> +;--------------------------------------------------------------------------------------
> +INIT_XMM sse2
> +cglobal cpy1Dto2D_shl_8, 3, 4, 5
> + add r2d, r2d
> + movd m0, r3m
> + lea r3, [r2 * 3]
> +
> + ; Row 0-3
> + mova m1, [r1 + 0 * mmsize]
> + mova m2, [r1 + 1 * mmsize]
> + mova m3, [r1 + 2 * mmsize]
> + mova m4, [r1 + 3 * mmsize]
> + psllw m1, m0
> + psllw m2, m0
> + psllw m3, m0
> + psllw m4, m0
> + mova [r0], m1
> + mova [r0 + r2], m2
> + mova [r0 + r2 * 2], m3
> + mova [r0 + r3], m4
> + lea r0, [r0 + r2 * 4]
> +
> + ; Row 4-7
> + mova m1, [r1 + 4 * mmsize]
> + mova m2, [r1 + 5 * mmsize]
> + mova m3, [r1 + 6 * mmsize]
> + mova m4, [r1 + 7 * mmsize]
> + psllw m1, m0
> + psllw m2, m0
> + psllw m3, m0
> + psllw m4, m0
> + mova [r0], m1
> + mova [r0 + r2], m2
> + mova [r0 + r2 * 2], m3
> + mova [r0 + r3], m4
> + RET
> +
> +
> +INIT_YMM avx2
> +cglobal cpy1Dto2D_shl_8, 3, 4, 3
> + add r2d, r2d
> + movd xm0, r3m
> + lea r3, [r2 * 3]
> +
> + ; Row 0-3
> movu m1, [r1 + 0 * mmsize]
> movu m2, [r1 + 1 * mmsize]
> - movu m3, [r1 + 2 * mmsize]
> - movu m4, [r1 + 3 * mmsize]
> - packssdw m1, m2
> - packssdw m3, m4
> + psllw m1, xm0
> + psllw m2, xm0
> + movu [r0], xm1
> + vextracti128 [r0 + r2], m1, 1
> + movu [r0 + r2 * 2], xm2
> + vextracti128 [r0 + r3], m2, 1
> +
> + ; Row 4-7
> + movu m1, [r1 + 2 * mmsize]
> + movu m2, [r1 + 3 * mmsize]
> + lea r0, [r0 + r2 * 4]
> + psllw m1, xm0
> + psllw m2, xm0
> + movu [r0], xm1
> + vextracti128 [r0 + r2], m1, 1
> + movu [r0 + r2 * 2], xm2
> + vextracti128 [r0 + r3], m2, 1
> + RET
> +
> +
> +;--------------------------------------------------------------------------------------
> +; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
> +;--------------------------------------------------------------------------------------
> +INIT_XMM sse2
> +cglobal cpy1Dto2D_shl_16, 3, 4, 5
> + add r2d, r2d
> + movd m0, r3m
> + mov r3d, 16/4
> +
> +.loop:
> + ; Row 0-1
> + mova m1, [r1 + 0 * mmsize]
> + mova m2, [r1 + 1 * mmsize]
> + mova m3, [r1 + 2 * mmsize]
> + mova m4, [r1 + 3 * mmsize]
> psllw m1, m0
> + psllw m2, m0
> psllw m3, m0
> - movh [r0], m1
> - movhps [r0 + r2], m1
> - movh [r0 + r2 * 2], m3
> - lea r2, [r2 * 3]
> - movhps [r0 + r2], m3
> + psllw m4, m0
> + mova [r0], m1
> + mova [r0 + 16], m2
> + mova [r0 + r2], m3
> + mova [r0 + r2 + 16], m4
> +
> + ; Row 2-3
> + mova m1, [r1 + 4 * mmsize]
> + mova m2, [r1 + 5 * mmsize]
> + mova m3, [r1 + 6 * mmsize]
> + mova m4, [r1 + 7 * mmsize]
> + lea r0, [r0 + r2 * 2]
> + psllw m1, m0
> + psllw m2, m0
> + psllw m3, m0
> + psllw m4, m0
> + mova [r0], m1
> + mova [r0 + 16], m2
> + mova [r0 + r2], m3
> + mova [r0 + r2 + 16], m4
> +
> + add r1, 8 * mmsize
> + lea r0, [r0 + r2 * 2]
> + dec r3d
> + jnz .loop
> RET
>
>
> INIT_YMM avx2
> -cglobal cvt32to16_shl_4, 3,3,3
> +cglobal cpy1Dto2D_shl_16, 3, 5, 3
> add r2d, r2d
> movd xm0, r3m
> -
> - ; Row 0-3
> + mov r3d, 16/4
> + lea r4, [r2 * 3]
> +
> +.loop:
> + ; Row 0-1
> movu m1, [r1 + 0 * mmsize]
> movu m2, [r1 + 1 * mmsize]
> - packssdw m1, m2
> psllw m1, xm0
> - vextracti128 xm0, m1, 1
> - movq [r0], xm1
> - movq [r0 + r2], xm0
> - lea r0, [r0 + r2 * 2]
> - movhps [r0], xm1
> - movhps [r0 + r2], xm0
> + psllw m2, xm0
> + movu [r0], m1
> + movu [r0 + r2], m2
> +
> + ; Row 2-3
> + movu m1, [r1 + 2 * mmsize]
> + movu m2, [r1 + 3 * mmsize]
> + psllw m1, xm0
> + psllw m2, xm0
> + movu [r0 + r2 * 2], m1
> + movu [r0 + r4], m2
> +
> + add r1, 4 * mmsize
> + lea r0, [r0 + r2 * 4]
> + dec r3d
> + jnz .loop
> RET
>
>
> ;--------------------------------------------------------------------------------------
> -; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
> +; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
> ;--------------------------------------------------------------------------------------
> INIT_XMM sse2
> -cglobal cvt32to16_shl_8, 3,5,5
> +cglobal cpy1Dto2D_shl_32, 3, 4, 5
> add r2d, r2d
> movd m0, r3m
> - mov r3d, 8/4
> - lea r4, [r2 * 3]
> + mov r3d, 32/2
> +
> +.loop:
> + ; Row 0
> + mova m1, [r1 + 0 * mmsize]
> + mova m2, [r1 + 1 * mmsize]
> + mova m3, [r1 + 2 * mmsize]
> + mova m4, [r1 + 3 * mmsize]
> + psllw m1, m0
> + psllw m2, m0
> + psllw m3, m0
> + psllw m4, m0
> + mova [r0 + 0 * mmsize], m1
> + mova [r0 + 1 * mmsize], m2
> + mova [r0 + 2 * mmsize], m3
> + mova [r0 + 3 * mmsize], m4
> +
> + ; Row 1
> + mova m1, [r1 + 4 * mmsize]
> + mova m2, [r1 + 5 * mmsize]
> + mova m3, [r1 + 6 * mmsize]
> + mova m4, [r1 + 7 * mmsize]
> + psllw m1, m0
> + psllw m2, m0
> + psllw m3, m0
> + psllw m4, m0
> + mova [r0 + r2 + 0 * mmsize], m1
> + mova [r0 + r2 + 1 * mmsize], m2
> + mova [r0 + r2 + 2 * mmsize], m3
> + mova [r0 + r2 + 3 * mmsize], m4
> +
> + add r1, 8 * mmsize
> + lea r0, [r0 + r2 * 2]
> + dec r3d
> + jnz .loop
> + RET
> +
> +
> +INIT_YMM avx2
> +cglobal cpy1Dto2D_shl_32, 3, 4, 5
> + add r2d, r2d
> + movd xm0, r3m
> + mov r3d, 32/2
>
> .loop:
> ; Row 0-1
> @@ -4007,252 +4087,14 @@
> movu m2, [r1 + 1 * mmsize]
> movu m3, [r1 + 2 * mmsize]
> movu m4, [r1 + 3 * mmsize]
> - packssdw m1, m2
> - packssdw m3, m4
> - psllw m1, m0
> - psllw m3, m0
> + psllw m1, xm0
> + psllw m2, xm0
> + psllw m3, xm0
> + psllw m4, xm0
> movu [r0], m1
> + movu [r0 + mmsize], m2
> movu [r0 + r2], m3
> -
> - ; Row 2-3
> - movu m1, [r1 + 4 * mmsize]
> - movu m2, [r1 + 5 * mmsize]
> - movu m3, [r1 + 6 * mmsize]
> - movu m4, [r1 + 7 * mmsize]
> - packssdw m1, m2
> - packssdw m3, m4
> - psllw m1, m0
> - psllw m3, m0
> - movu [r0 + r2 * 2], m1
> - movu [r0 + r4], m3
> -
> - add r1, 8 * mmsize
> - lea r0, [r0 + r2 * 4]
> - dec r3d
> - jnz .loop
> - RET
> -
> -
> -INIT_YMM avx2
> -cglobal cvt32to16_shl_8, 3,4,3
> - add r2d, r2d
> - movd xm0, r3m
> - lea r3, [r2 * 3]
> -
> - ; Row 0-1
> - movu xm1, [r1 + 0 * mmsize]
> - vinserti128 m1, m1, [r1 + 1 * mmsize], 1
> - movu xm2, [r1 + 0 * mmsize + mmsize/2]
> - vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
> - packssdw m1, m2
> - psllw m1, xm0
> - movu [r0], xm1
> - vextracti128 [r0 + r2], m1, 1
> -
> - ; Row 2-3
> - movu xm1, [r1 + 2 * mmsize]
> - vinserti128 m1, m1, [r1 + 3 * mmsize], 1
> - movu xm2, [r1 + 2 * mmsize + mmsize/2]
> - vinserti128 m2, m2, [r1 + 3 * mmsize + mmsize/2], 1
> - packssdw m1, m2
> - psllw m1, xm0
> - movu [r0 + r2 * 2], xm1
> - vextracti128 [r0 + r3], m1, 1
> -
> - add r1, 4 * mmsize
> - lea r0, [r0 + r2 * 4]
> -
> - ; Row 4-5
> - movu m1, [r1 + 0 * mmsize]
> - movu m2, [r1 + 1 * mmsize]
> - packssdw m1, m2
> - vpermq m1, m1, 11011000b
> - psllw m1, xm0
> - movu [r0], xm1
> - vextracti128 [r0 + r2], m1, 1
> -
> - ; Row 6-7
> - movu m1, [r1 + 2 * mmsize]
> - movu m2, [r1 + 3 * mmsize]
> - packssdw m1, m2
> - vpermq m1, m1, 11011000b
> - psllw m1, xm0
> - movu [r0 + r2 * 2], xm1
> - vextracti128 [r0 + r3], m1, 1
> - RET
> -
> -;--------------------------------------------------------------------------------------
> -; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
> -;--------------------------------------------------------------------------------------
> -INIT_XMM sse2
> -cglobal cvt32to16_shl_16, 3,4,5
> - add r2d, r2d
> - movd m0, r3m
> - mov r3d, 16/2
> -
> -.loop:
> - ; Row 0
> - movu m1, [r1 + 0 * mmsize]
> - movu m2, [r1 + 1 * mmsize]
> - movu m3, [r1 + 2 * mmsize]
> - movu m4, [r1 + 3 * mmsize]
> - packssdw m1, m2
> - packssdw m3, m4
> - psllw m1, m0
> - psllw m3, m0
> - movu [r0], m1
> - movu [r0 + mmsize], m3
> -
> - ; Row 1
> - movu m1, [r1 + 4 * mmsize]
> - movu m2, [r1 + 5 * mmsize]
> - movu m3, [r1 + 6 * mmsize]
> - movu m4, [r1 + 7 * mmsize]
> - packssdw m1, m2
> - packssdw m3, m4
> - psllw m1, m0
> - psllw m3, m0
> - movu [r0 + r2], m1
> - movu [r0 + r2 + mmsize], m3
> -
> - add r1, 8 * mmsize
> - lea r0, [r0 + r2 * 2]
> - dec r3d
> - jnz .loop
> - RET
> -
> -
> -INIT_YMM avx2
> -cglobal cvt32to16_shl_16, 3,5,3
> - add r2d, r2d
> - movd xm0, r3m
> - mov r3d, 16/4
> - lea r4, [r2 * 3]
> -
> -.loop:
> - ; Row 0
> - movu xm1, [r1 + 0 * mmsize]
> - vinserti128 m1, m1, [r1 + 1 * mmsize], 1
> - movu xm2, [r1 + 0 * mmsize + mmsize/2]
> - vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
> - packssdw m1, m2
> - psllw m1, xm0
> - movu [r0], m1
> -
> - ; Row 1
> - movu xm1, [r1 + 2 * mmsize]
> - vinserti128 m1, m1, [r1 + 3 * mmsize], 1
> - movu xm2, [r1 + 2 * mmsize + mmsize/2]
> - vinserti128 m2, m2, [r1 + 3 * mmsize + mmsize/2], 1
> - packssdw m1, m2
> - psllw m1, xm0
> - movu [r0 + r2], m1
> -
> - add r1, 4 * mmsize
> -
> - ; Row 2
> - movu xm1, [r1 + 0 * mmsize]
> - vinserti128 m1, m1, [r1 + 1 * mmsize], 1
> - movu xm2, [r1 + 0 * mmsize + mmsize/2]
> - vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
> - packssdw m1, m2
> - psllw m1, xm0
> - movu [r0 + r2 * 2], m1
> -
> - ; Row 3
> - movu m1, [r1 + 2 * mmsize]
> - movu m2, [r1 + 3 * mmsize]
> - packssdw m1, m2
> - psllw m1, xm0
> - vpermq m1, m1, 11011000b
> - movu [r0 + r4], m1
> -
> - add r1, 4 * mmsize
> - lea r0, [r0 + r2 * 4]
> - dec r3d
> - jnz .loop
> - RET
> -
> -
> -;--------------------------------------------------------------------------------------
> -; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
> -;--------------------------------------------------------------------------------------
> -INIT_XMM sse2
> -cglobal cvt32to16_shl_32, 3,4,5
> - add r2d, r2d
> - movd m0, r3m
> - mov r3d, 32/1
> -
> -.loop:
> - ; Row 0
> - movu m1, [r1 + 0 * mmsize]
> - movu m2, [r1 + 1 * mmsize]
> - movu m3, [r1 + 2 * mmsize]
> - movu m4, [r1 + 3 * mmsize]
> - packssdw m1, m2
> - packssdw m3, m4
> - psllw m1, m0
> - psllw m3, m0
> - movu [r0 + 0 * mmsize], m1
> - movu [r0 + 1 * mmsize], m3
> -
> - movu m1, [r1 + 4 * mmsize]
> - movu m2, [r1 + 5 * mmsize]
> - movu m3, [r1 + 6 * mmsize]
> - movu m4, [r1 + 7 * mmsize]
> - packssdw m1, m2
> - packssdw m3, m4
> - psllw m1, m0
> - psllw m3, m0
> - movu [r0 + 2 * mmsize], m1
> - movu [r0 + 3 * mmsize], m3
> -
> - add r1, 8 * mmsize
> - add r0, r2
> - dec r3d
> - jnz .loop
> - RET
> -
> -
> -INIT_YMM avx2
> -cglobal cvt32to16_shl_32, 3,4,5
> - add r2d, r2d
> - movd xm0, r3m
> - mov r3d, 32/2
> -
> -.loop:
> - ; Row 0
> - movu xm1, [r1 + 0 * mmsize]
> - vinserti128 m1, m1, [r1 + 1 * mmsize], 1
> - movu xm2, [r1 + 0 * mmsize + mmsize/2]
> - vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
> - movu xm3, [r1 + 2 * mmsize]
> - vinserti128 m3, m3, [r1 + 3 * mmsize], 1
> - movu xm4, [r1 + 2 * mmsize + mmsize/2]
> - vinserti128 m4, m4, [r1 + 3 * mmsize + mmsize/2], 1
> - packssdw m1, m2
> - packssdw m3, m4
> - psllw m1, xm0
> - psllw m3, xm0
> - movu [r0], m1
> - movu [r0 + mmsize], m3
> -
> - add r1, 4 * mmsize
> -
> - ; Row 1
> - movu xm1, [r1 + 0 * mmsize]
> - vinserti128 m1, m1, [r1 + 1 * mmsize], 1
> - movu xm2, [r1 + 0 * mmsize + mmsize/2]
> - vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
> - movu m3, [r1 + 2 * mmsize]
> - movu m4, [r1 + 3 * mmsize]
> - packssdw m1, m2
> - packssdw m3, m4
> - psllw m1, xm0
> - psllw m3, xm0
> - vpermq m3, m3, 11011000b
> - movu [r0 + r2], m1
> - movu [r0 + r2 + mmsize], m3
> + movu [r0 + r2 + mmsize], m4
>
> add r1, 4 * mmsize
> lea r0, [r0 + r2 * 2]
> @@ -4262,7 +4104,7 @@
>
>
> ;--------------------------------------------------------------------------------------
> -; uint32_t copy_cnt(int16_t *dst, int16_t *src, intptr_t stride);
> +; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
> ;--------------------------------------------------------------------------------------
> INIT_XMM sse4
> cglobal copy_cnt_4, 3,3,3
> @@ -4301,7 +4143,7 @@
>
>
> ;--------------------------------------------------------------------------------------
> -; uint32_t copy_cnt(int16_t *dst, int16_t *src, intptr_t stride);
> +; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
> ;--------------------------------------------------------------------------------------
> INIT_XMM sse4
> cglobal copy_cnt_8, 3,3,6
> @@ -4405,7 +4247,7 @@
>
>
> ;--------------------------------------------------------------------------------------
> -; uint32_t copy_cnt(int16_t *dst, int16_t *src, intptr_t stride);
> +; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
> ;--------------------------------------------------------------------------------------
> INIT_XMM sse4
> cglobal copy_cnt_16, 3,4,6
> @@ -4516,7 +4358,7 @@
> RET
>
> ;--------------------------------------------------------------------------------------
> -; uint32_t copy_cnt(int32_t *dst, int16_t *src, intptr_t stride);
> +; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
> ;--------------------------------------------------------------------------------------
> INIT_XMM sse4
> cglobal copy_cnt_32, 3,4,6
> @@ -4623,180 +4465,432 @@
> movd eax, xm4
> RET
>
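copy_cnt keeps its shape through the refactor: copy the coefficient block
into a contiguous buffer and return how many coefficients are non-zero
(roughly the count the encoder uses when deciding whether a block has
coded coefficients). A per-size C sketch of that:

    #include <stdint.h>

    /* sketch: copy an NxN coefficient block and count non-zero entries */
    static uint32_t copy_cnt_c(int16_t* dst, const int16_t* src,
                               intptr_t srcStride, int size)
    {
        uint32_t numSig = 0;
        for (int i = 0; i < size; i++)
        {
            for (int j = 0; j < size; j++)
            {
                dst[j] = src[j];
                numSig += (src[j] != 0);
            }
            dst += size;
            src += srcStride;
        }
        return numSig;
    }
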
> -;-----------------------------------------------------------------------------
> -; void copy_shr(short *dst, short *src, intptr_t stride, int shift, int size)
> -;-----------------------------------------------------------------------------
> -
> -INIT_XMM sse4
> -cglobal copy_shr, 4, 7, 4, dst, src, stride
> -%define rnd m2
> -%define shift m1
> -
> - ; make shift
> - mov r5d, r3m
> - movd shift, r5d
> -
> - ; make round
> - dec r5
> - xor r6, r6
> - bts r6, r5
> -
> - movd rnd, r6d
> - pshufd rnd, rnd, 0
> +
> +;--------------------------------------------------------------------------------------
> +; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> +;--------------------------------------------------------------------------------------
> +INIT_XMM sse2
> +cglobal cpy2Dto1D_shl_4, 4, 4, 4
> + add r2d, r2d
> + movd m0, r3d
>
> ; register alloc
> ; r0 - dst
> ; r1 - src
> - ; r2 - stride * 2 (short*)
> - ; r3 - lx
> - ; r4 - size
> - ; r5 - ly
> - ; r6 - diff
> - add r2d, r2d
> -
> - mov r4d, r4m
> - mov r5, r4 ; size
> - mov r6, r2 ; stride
> - sub r6, r4
> - add r6, r6
> -
> - shr r5, 1
> -.loop_row:
> -
> - mov r3, r4
> - shr r3, 2
> -.loop_col:
> - ; row 0
> - movh m3, [r1]
> - pmovsxwd m0, m3
> - paddd m0, rnd
> - psrad m0, shift
> - packssdw m0, m0
> - movh [r0], m0
> -
> - ; row 1
> - movh m3, [r1 + r4 * 2]
> - pmovsxwd m0, m3
> - paddd m0, rnd
> - psrad m0, shift
> - packssdw m0, m0
> - movh [r0 + r2], m0
> -
> - ; move col pointer
> - add r1, 8
> - add r0, 8
> -
> - dec r3
> - jg .loop_col
> -
> - ; update pointer
> - lea r1, [r1 + r4 * 2]
> - add r0, r6
> -
> - ; end of loop_row
> - dec r5
> - jg .loop_row
> + ; r2 - srcStride
> + ; m0 - shift
> +
> + ; Row 0-3
> + movh m2, [r1]
> + movhps m2, [r1 + r2]
> + lea r1, [r1 + r2 * 2]
> + movh m3, [r1]
> + movhps m3, [r1 + r2]
> + psllw m2, m0
> + psllw m3, m0
> + mova [r0 + 0 * mmsize], m2
> + mova [r0 + 1 * mmsize], m3
>
> RET
>
> +
> ;--------------------------------------------------------------------------------------
> -; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
> +; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> ;--------------------------------------------------------------------------------------
> INIT_XMM sse2
> -cglobal copy_shl_4, 3,3,3
> +cglobal cpy2Dto1D_shl_8, 4, 5, 4
> + add r2d, r2d
> + movd m0, r3d
> + mov r3d, 8/4
> + lea r4, [r2 * 3]
> +
> + ; register alloc
> + ; r0 - dst
> + ; r1 - src
> + ; r2 - srcStride
> + ; r3 - loop counter
> + ; r4 - stride * 3
> + ; m0 - shift
> +
> +.loop:
> + ; Row 0, 1
> + mova m2, [r1]
> + mova m3, [r1 + r2]
> + psllw m2, m0
> + psllw m3, m0
> + mova [r0 + 0 * mmsize], m2
> + mova [r0 + 1 * mmsize], m3
> +
> + ; Row 2, 3
> + mova m2, [r1 + r2 * 2]
> + mova m3, [r1 + r4]
> + psllw m2, m0
> + psllw m3, m0
> + mova [r0 + 2 * mmsize], m2
> + mova [r0 + 3 * mmsize], m3
> +
> + add r0, 4 * mmsize
> + lea r1, [r1 + r2 * 4]
> + dec r3d
> + jnz .loop
> + RET
> +
> +
> +;--------------------------------------------------------------------------------------
> +; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> +;--------------------------------------------------------------------------------------
> +INIT_XMM sse2
> +cglobal cpy2Dto1D_shl_16, 4, 4, 4
> + add r2d, r2d
> + movd m0, r3d
> + mov r3d, 16/2
> +
> + ; register alloc
> + ; r0 - dst
> + ; r1 - src
> + ; r2 - srcStride
> + ; r3 - loop counter
> + ; m0 - shift
> +
> +.loop:
> + ; Row 0
> + mova m2, [r1 + 0 * mmsize]
> + mova m3, [r1 + 1 * mmsize]
> + psllw m2, m0
> + psllw m3, m0
> + mova [r0 + 0 * mmsize], m2
> + mova [r0 + 1 * mmsize], m3
> +
> + ; Row 1
> + mova m2, [r1 + r2 + 0 * mmsize]
> + mova m3, [r1 + r2 + 1 * mmsize]
> + psllw m2, m0
> + psllw m3, m0
> + mova [r0 + 2 * mmsize], m2
> + mova [r0 + 3 * mmsize], m3
> +
> + add r0, 4 * mmsize
> + lea r1, [r1 + r2 * 2]
> + dec r3d
> + jnz .loop
> + RET
> +
> +
> +;--------------------------------------------------------------------------------------
> +; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> +;--------------------------------------------------------------------------------------
> +INIT_XMM sse2
> +cglobal cpy2Dto1D_shl_32, 4, 4, 6
> + add r2d, r2d
> + movd m0, r3d
> + mov r3d, 32/1
> +
> + ; register alloc
> + ; r0 - dst
> + ; r1 - src
> + ; r2 - srcStride
> + ; r3 - loop counter
> + ; m0 - shift
> +
> +.loop:
> + ; Row 0
> + mova m2, [r1 + 0 * mmsize]
> + mova m3, [r1 + 1 * mmsize]
> + mova m4, [r1 + 2 * mmsize]
> + mova m5, [r1 + 3 * mmsize]
> + psllw m2, m0
> + psllw m3, m0
> + psllw m4, m0
> + psllw m5, m0
> + mova [r0 + 0 * mmsize], m2
> + mova [r0 + 1 * mmsize], m3
> + mova [r0 + 2 * mmsize], m4
> + mova [r0 + 3 * mmsize], m5
> +
> + add r0, 4 * mmsize
> + add r1, r2
> + dec r3d
> + jnz .loop
> + RET
> +
> +
> +;--------------------------------------------------------------------------------------
> +; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
> +;--------------------------------------------------------------------------------------
> +INIT_XMM sse2
> +cglobal cpy1Dto2D_shr_4, 3, 3, 4
> add r2d, r2d
> movd m0, r3m
> + pcmpeqw m1, m1
> + psllw m1, m0
> + psraw m1, 1
>
> ; Row 0-3
> - movu m1, [r1 + 0 * mmsize]
> - movu m2, [r1 + 1 * mmsize]
> - psllw m1, m0
> - psllw m2, m0
> - movh [r0], m1
> - movhps [r0 + r2], m1
> - movh [r0 + r2 * 2], m2
> + mova m2, [r1 + 0 * mmsize]
> + mova m3, [r1 + 1 * mmsize]
> + psubw m2, m1
> + psubw m3, m1
> + psraw m2, m0
> + psraw m3, m0
> + movh [r0], m2
> + movhps [r0 + r2], m2
> + movh [r0 + r2 * 2], m3
> lea r2, [r2 * 3]
> - movhps [r0 + r2], m2
> + movhps [r0 + r2], m3
> RET
>
> +
> +INIT_YMM avx2
> +cglobal cpy1Dto2D_shr_4, 3, 3, 3
> + add r2d, r2d
> + movd xm0, r3m
> + pcmpeqw m1, m1
> + psllw m1, xm0
> + psraw m1, 1
> +
> + ; Row 0-3
> + movu m2, [r1]
> + psubw m2, m1
> + psraw m2, xm0
> + vextracti128 xm1, m2, 1
> + movq [r0], xm2
> + movhps [r0 + r2], xm2
> + lea r0, [r0 + r2 * 2]
> + movq [r0], xm1
> + movhps [r0 + r2], xm1
> + RET
> +
> +
> ;--------------------------------------------------------------------------------------
> -; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
> +; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
> ;--------------------------------------------------------------------------------------
> INIT_XMM sse2
> -cglobal copy_shl_8, 3,4,5
> +cglobal cpy1Dto2D_shr_8, 3, 4, 6
> add r2d, r2d
> movd m0, r3m
> + pcmpeqw m1, m1
> + psllw m1, m0
> + psraw m1, 1
> + lea r3, [r2 * 3]
>
> ; Row 0-3
> - movu m1, [r1 + 0 * mmsize]
> - movu m2, [r1 + 1 * mmsize]
> - movu m3, [r1 + 2 * mmsize]
> - movu m4, [r1 + 3 * mmsize]
> - psllw m1, m0
> - psllw m2, m0
> - psllw m3, m0
> - psllw m4, m0
> - movu [r0], m1
> - movu [r0 + r2], m2
> - movu [r0 + 2 * r2], m3
> - lea r0, [r0 + 2 * r2]
> - movu [r0 + r2], m4
> + mova m2, [r1 + 0 * mmsize]
> + mova m3, [r1 + 1 * mmsize]
> + mova m4, [r1 + 2 * mmsize]
> + mova m5, [r1 + 3 * mmsize]
> + psubw m2, m1
> + psubw m3, m1
> + psubw m4, m1
> + psubw m5, m1
> + psraw m2, m0
> + psraw m3, m0
> + psraw m4, m0
> + psraw m5, m0
> + mova [r0], m2
> + mova [r0 + r2], m3
> + mova [r0 + r2 * 2], m4
> + mova [r0 + r3], m5
>
> ; Row 4-7
> - movu m1, [r1 + 4 * mmsize]
> - movu m2, [r1 + 5 * mmsize]
> - movu m3, [r1 + 6 * mmsize]
> - movu m4, [r1 + 7 * mmsize]
> - psllw m1, m0
> - psllw m2, m0
> - psllw m3, m0
> - psllw m4, m0
> - movu [r0 + r2 * 2], m1
> - lea r0, [r0 + 2 * r2]
> - movu [r0 + r2], m2
> - movu [r0 + 2 * r2], m3
> - lea r0, [r0 + 2 * r2]
> - movu [r0 + r2], m4
> + mova m2, [r1 + 4 * mmsize]
> + mova m3, [r1 + 5 * mmsize]
> + mova m4, [r1 + 6 * mmsize]
> + mova m5, [r1 + 7 * mmsize]
> + lea r0, [r0 + r2 * 4]
> + psubw m2, m1
> + psubw m3, m1
> + psubw m4, m1
> + psubw m5, m1
> + psraw m2, m0
> + psraw m3, m0
> + psraw m4, m0
> + psraw m5, m0
> + mova [r0], m2
> + mova [r0 + r2], m3
> + mova [r0 + r2 * 2], m4
> + mova [r0 + r3], m5
> RET
>
> +
> +INIT_YMM avx2
> +cglobal cpy1Dto2D_shr_8, 3, 4, 4
> + add r2d, r2d
> + movd xm0, r3m
> + pcmpeqw m1, m1
> + psllw m1, xm0
> + psraw m1, 1
> + lea r3, [r2 * 3]
> +
> + ; Row 0-3
> + movu m2, [r1 + 0 * mmsize]
> + movu m3, [r1 + 1 * mmsize]
> + psubw m2, m1
> + psubw m3, m1
> + psraw m2, xm0
> + psraw m3, xm0
> + movu [r0], xm2
> + vextracti128 [r0 + r2], m2, 1
> + movu [r0 + r2 * 2], xm3
> + vextracti128 [r0 + r3], m3, 1
> +
> + ; Row 4-7
> + movu m2, [r1 + 2 * mmsize]
> + movu m3, [r1 + 3 * mmsize]
> + lea r0, [r0 + r2 * 4]
> + psubw m2, m1
> + psubw m3, m1
> + psraw m2, xm0
> + psraw m3, xm0
> + movu [r0], xm2
> + vextracti128 [r0 + r2], m2, 1
> + movu [r0 + r2 * 2], xm3
> + vextracti128 [r0 + r3], m3, 1
> + RET
> +
> +
> ;--------------------------------------------------------------------------------------
> -; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
> +; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
> ;--------------------------------------------------------------------------------------
> INIT_XMM sse2
> -cglobal copy_shl_16, 3,4,5
> +cglobal cpy1Dto2D_shr_16, 3, 5, 6
> add r2d, r2d
> movd m0, r3m
> - mov r3d, 256/64
> + pcmpeqw m1, m1
> + psllw m1, m0
> + psraw m1, 1
> + mov r3d, 16/4
> + lea r4, [r2 * 3]
>
> .loop:
> - ; Row 0-3
> - movu m1, [r1 + 0 * mmsize]
> - movu m2, [r1 + 1 * mmsize]
> - movu m3, [r1 + 2 * mmsize]
> - movu m4, [r1 + 3 * mmsize]
> + ; Row 0-1
> + mova m2, [r1 + 0 * mmsize]
> + mova m3, [r1 + 1 * mmsize]
> + mova m4, [r1 + 2 * mmsize]
> + mova m5, [r1 + 3 * mmsize]
> + psubw m2, m1
> + psubw m3, m1
> + psubw m4, m1
> + psubw m5, m1
> + psraw m2, m0
> + psraw m3, m0
> + psraw m4, m0
> + psraw m5, m0
> + mova [r0], m2
> + mova [r0 + mmsize], m3
> + mova [r0 + r2], m4
> + mova [r0 + r2 + mmsize], m5
> +
> + ; Row 2-3
> + mova m2, [r1 + 4 * mmsize]
> + mova m3, [r1 + 5 * mmsize]
> + mova m4, [r1 + 6 * mmsize]
> + mova m5, [r1 + 7 * mmsize]
> + psubw m2, m1
> + psubw m3, m1
> + psubw m4, m1
> + psubw m5, m1
> + psraw m2, m0
> + psraw m3, m0
> + psraw m4, m0
> + psraw m5, m0
> + mova [r0 + r2 * 2], m2
> + mova [r0 + r2 * 2 + mmsize], m3
> + mova [r0 + r4], m4
> + mova [r0 + r4 + mmsize], m5
> +
> + add r1, 8 * mmsize
> + lea r0, [r0 + r2 * 4]
> + dec r3d
> + jnz .loop
> + RET
> +
> +
> +INIT_YMM avx2
> +cglobal cpy1Dto2D_shr_16, 3, 5, 4
> + add r2d, r2d
> + movd xm0, r3m
> + pcmpeqw m1, m1
> + psllw m1, xm0
> + psraw m1, 1
> + mov r3d, 16/4
> + lea r4, [r2 * 3]
> +
> +.loop:
> + ; Row 0-1
> + movu m2, [r1 + 0 * mmsize]
> + movu m3, [r1 + 1 * mmsize]
> + psubw m2, m1
> + psubw m3, m1
> + psraw m2, xm0
> + psraw m3, xm0
> + movu [r0], m2
> + movu [r0 + r2], m3
> +
> + ; Row 2-3
> + movu m2, [r1 + 2 * mmsize]
> + movu m3, [r1 + 3 * mmsize]
> + psubw m2, m1
> + psubw m3, m1
> + psraw m2, xm0
> + psraw m3, xm0
> + movu [r0 + r2 * 2], m2
> + movu [r0 + r4], m3
> +
> + add r1, 4 * mmsize
> + lea r0, [r0 + r2 * 4]
> + dec r3d
> + jnz .loop
> + RET
> +
> +
> +;--------------------------------------------------------------------------------------
> +; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
> +;--------------------------------------------------------------------------------------
> +INIT_XMM sse2
> +cglobal cpy1Dto2D_shr_32, 3, 4, 6
> + add r2d, r2d
> + movd m0, r3m
> + pcmpeqw m1, m1
> psllw m1, m0
> - psllw m2, m0
> - psllw m3, m0
> - psllw m4, m0
> - movu [r0], m1
> - movu [r0 + 16], m2
> - movu [r0 + r2], m3
> - movu [r0 + r2 + 16], m4
> -
> - ; Row 4-7
> - movu m1, [r1 + 4 * mmsize]
> - movu m2, [r1 + 5 * mmsize]
> - movu m3, [r1 + 6 * mmsize]
> - movu m4, [r1 + 7 * mmsize]
> - psllw m1, m0
> - psllw m2, m0
> - psllw m3, m0
> - psllw m4, m0
> - movu [r0 + r2 * 2], m1
> - movu [r0 + r2 * 2 + 16], m2
> - lea r0, [r0 + r2 * 2]
> - movu [r0 + r2], m3
> - movu [r0 + r2 + 16], m4
> + psraw m1, 1
> + mov r3d, 32/2
> +
> +.loop:
> + ; Row 0
> + mova m2, [r1 + 0 * mmsize]
> + mova m3, [r1 + 1 * mmsize]
> + mova m4, [r1 + 2 * mmsize]
> + mova m5, [r1 + 3 * mmsize]
> + psubw m2, m1
> + psubw m3, m1
> + psubw m4, m1
> + psubw m5, m1
> + psraw m2, m0
> + psraw m3, m0
> + psraw m4, m0
> + psraw m5, m0
> + mova [r0 + 0 * mmsize], m2
> + mova [r0 + 1 * mmsize], m3
> + mova [r0 + 2 * mmsize], m4
> + mova [r0 + 3 * mmsize], m5
> +
> + ; Row 1
> + mova m2, [r1 + 4 * mmsize]
> + mova m3, [r1 + 5 * mmsize]
> + mova m4, [r1 + 6 * mmsize]
> + mova m5, [r1 + 7 * mmsize]
> + psubw m2, m1
> + psubw m3, m1
> + psubw m4, m1
> + psubw m5, m1
> + psraw m2, m0
> + psraw m3, m0
> + psraw m4, m0
> + psraw m5, m0
> + mova [r0 + r2 + 0 * mmsize], m2
> + mova [r0 + r2 + 1 * mmsize], m3
> + mova [r0 + r2 + 2 * mmsize], m4
> + mova [r0 + r2 + 3 * mmsize], m5
>
> add r1, 8 * mmsize
> lea r0, [r0 + r2 * 2]
> @@ -4804,45 +4898,36 @@
> jnz .loop
> RET
>
> -;--------------------------------------------------------------------------------------
> -; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
> -;--------------------------------------------------------------------------------------
> -INIT_XMM sse2
> -cglobal copy_shl_32, 3,4,5
> +
> +INIT_YMM avx2
> +cglobal cpy1Dto2D_shr_32, 3, 4, 6
> add r2d, r2d
> - movd m0, r3m
> - mov r3d, 1024/64
> + movd xm0, r3m
> + pcmpeqw m1, m1
> + psllw m1, xm0
> + psraw m1, 1
> + mov r3d, 32/2
>
> .loop:
> - ; Row 0-3
> - movu m1, [r1 + 0 * mmsize]
> - movu m2, [r1 + 1 * mmsize]
> - movu m3, [r1 + 2 * mmsize]
> - movu m4, [r1 + 3 * mmsize]
> - psllw m1, m0
> - psllw m2, m0
> - psllw m3, m0
> - psllw m4, m0
> - movu [r0], m1
> - movu [r0 + 16], m2
> - movu [r0 + 32], m3
> - movu [r0 + 48], m4
> -
> - ; Row 4-7
> - movu m1, [r1 + 4 * mmsize]
> - movu m2, [r1 + 5 * mmsize]
> - movu m3, [r1 + 6 * mmsize]
> - movu m4, [r1 + 7 * mmsize]
> - psllw m1, m0
> - psllw m2, m0
> - psllw m3, m0
> - psllw m4, m0
> - movu [r0 + r2], m1
> - movu [r0 + r2 + 16], m2
> - movu [r0 + r2 + 32], m3
> - movu [r0 + r2 + 48], m4
> -
> - add r1, 8 * mmsize
> + ; Row 0-1
> + movu m2, [r1 + 0 * mmsize]
> + movu m3, [r1 + 1 * mmsize]
> + movu m4, [r1 + 2 * mmsize]
> + movu m5, [r1 + 3 * mmsize]
> + psubw m2, m1
> + psubw m3, m1
> + psubw m4, m1
> + psubw m5, m1
> + psraw m2, xm0
> + psraw m3, xm0
> + psraw m4, xm0
> + psraw m5, xm0
> + movu [r0], m2
> + movu [r0 + mmsize], m3
> + movu [r0 + r2], m4
> + movu [r0 + r2 + mmsize], m5
> +
> + add r1, 4 * mmsize
> lea r0, [r0 + r2 * 2]
> dec r3d
> jnz .loop
> diff -r dfe0803ae6be -r b4454aa1b6ab source/common/x86/blockcopy8.h
> --- a/source/common/x86/blockcopy8.h Wed Nov 26 16:56:00 2014 -0600
> +++ b/source/common/x86/blockcopy8.h Thu Nov 27 10:12:03 2014 +0900
> @@ -24,32 +24,38 @@
> #ifndef X265_BLOCKCOPY8_H
> #define X265_BLOCKCOPY8_H
>
> -void x265_cvt32to16_shl_4_sse2(int16_t* dst, const int* src, intptr_t, int);
> -void x265_cvt32to16_shl_8_sse2(int16_t* dst, const int* src, intptr_t, int);
> -void x265_cvt32to16_shl_16_sse2(int16_t* dst, const int* src, intptr_t, int);
> -void x265_cvt32to16_shl_32_sse2(int16_t* dst, const int* src, intptr_t, int);
> -void x265_cvt32to16_shl_4_avx2(int16_t* dst, const int* src, intptr_t, int);
> -void x265_cvt32to16_shl_8_avx2(int16_t* dst, const int* src, intptr_t, int);
> -void x265_cvt32to16_shl_16_avx2(int16_t* dst, const int* src, intptr_t, int);
> -void x265_cvt32to16_shl_32_avx2(int16_t* dst, const int* src, intptr_t, int);
> -void x265_copy16to16_shl_sse2(int16_t* dst, const int16_t* src, intptr_t, int32_t, int32_t);
> -void x265_cvt16to32_shr_4_sse4(int32_t* dst, const int16_t* src, intptr_t, int32_t, int32_t);
> -void x265_cvt16to32_shr_8_sse4(int32_t* dst, const int16_t* src, intptr_t, int32_t, int32_t);
> -void x265_cvt16to32_shr_16_sse4(int32_t* dst, const int16_t* src, intptr_t, int32_t, int32_t);
> -void x265_cvt16to32_shr_32_sse4(int32_t* dst, const int16_t* src, intptr_t, int32_t, int32_t);
> -void x265_copy_shr_sse4(int16_t* dst, const int16_t* src, intptr_t, int, int);
> -void x265_copy_shl_4_sse2(int16_t* dst, const int16_t* src, intptr_t, int);
> -void x265_copy_shl_8_sse2(int16_t* dst, const int16_t* src, intptr_t, int);
> -void x265_copy_shl_16_sse2(int16_t* dst, const int16_t* src, intptr_t, int);
> -void x265_copy_shl_32_sse2(int16_t* dst, const int16_t* src, intptr_t, int);
> -uint32_t x265_copy_cnt_4_sse4(int16_t* dst, const int16_t* src, intptr_t);
> -uint32_t x265_copy_cnt_8_sse4(int16_t* dst, const int16_t* src, intptr_t);
> -uint32_t x265_copy_cnt_16_sse4(int16_t* dst, const int16_t* src, intptr_t);
> -uint32_t x265_copy_cnt_32_sse4(int16_t* dst, const int16_t* src, intptr_t);
> -uint32_t x265_copy_cnt_4_avx2(int16_t* dst, const int16_t* src, intptr_t);
> -uint32_t x265_copy_cnt_8_avx2(int16_t* dst, const int16_t* src, intptr_t);
> -uint32_t x265_copy_cnt_16_avx2(int16_t* dst, const int16_t* src, intptr_t);
> -uint32_t x265_copy_cnt_32_avx2(int16_t* dst, const int16_t* src, intptr_t);
> +void x265_cpy2Dto1D_shl_4_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> +void x265_cpy2Dto1D_shl_8_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> +void x265_cpy2Dto1D_shl_16_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> +void x265_cpy2Dto1D_shl_32_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> +void x265_cpy2Dto1D_shr_4_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> +void x265_cpy2Dto1D_shr_8_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> +void x265_cpy2Dto1D_shr_16_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> +void x265_cpy2Dto1D_shr_32_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> +void x265_cpy1Dto2D_shl_4_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
> +void x265_cpy1Dto2D_shl_8_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
> +void x265_cpy1Dto2D_shl_16_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
> +void x265_cpy1Dto2D_shl_32_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
> +void x265_cpy1Dto2D_shl_4_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
> +void x265_cpy1Dto2D_shl_8_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
> +void x265_cpy1Dto2D_shl_16_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
> +void x265_cpy1Dto2D_shl_32_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
> +void x265_cpy1Dto2D_shr_4_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
> +void x265_cpy1Dto2D_shr_8_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
> +void x265_cpy1Dto2D_shr_16_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
> +void x265_cpy1Dto2D_shr_32_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
> +void x265_cpy1Dto2D_shr_4_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
> +void x265_cpy1Dto2D_shr_8_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
> +void x265_cpy1Dto2D_shr_16_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
> +void x265_cpy1Dto2D_shr_32_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
> +uint32_t x265_copy_cnt_4_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
> +uint32_t x265_copy_cnt_8_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
> +uint32_t x265_copy_cnt_16_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
> +uint32_t x265_copy_cnt_32_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
> +uint32_t x265_copy_cnt_4_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
> +uint32_t x265_copy_cnt_8_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
> +uint32_t x265_copy_cnt_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
> +uint32_t x265_copy_cnt_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
>
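The cpy2Dto1D_shl_* declarations above go the opposite direction: a strided 2D source is
packed into a contiguous 1D buffer while each coefficient is shifted left. Again only a
sketch (not the actual C primitive), the reference behaviour would be something like:

    #include <stdint.h>

    static void cpy2Dto1D_shl_ref(int16_t* dst, const int16_t* src,
                                  intptr_t srcStride, int shift, int size)
    {
        for (int i = 0; i < size; i++)
        {
            for (int j = 0; j < size; j++)
                dst[j] = (int16_t)(src[j] << shift);
            dst += size;      /* packed 1D destination */
            src += srcStride; /* strided 2D source     */
        }
    }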
> #define SETUP_BLOCKCOPY_FUNC(W, H, cpu) \
> void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); \
> @@ -181,17 +187,17 @@
> void x265_blockfill_s_8x8_sse2(int16_t* dst, intptr_t dstride, int16_t val);
> void x265_blockfill_s_16x16_sse2(int16_t* dst, intptr_t dstride, int16_t val);
> void x265_blockfill_s_32x32_sse2(int16_t* dst, intptr_t dstride, int16_t val);
> -void x265_blockcopy_ss_16x4_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
> -void x265_blockcopy_ss_16x8_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
> -void x265_blockcopy_ss_16x12_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
> -void x265_blockcopy_ss_16x16_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
> -void x265_blockcopy_ss_16x24_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
> -void x265_blockcopy_ss_16x32_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
> -void x265_blockcopy_ss_16x64_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
> -void x265_blockcopy_ss_64x16_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
> -void x265_blockcopy_ss_64x32_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
> -void x265_blockcopy_ss_64x48_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
> -void x265_blockcopy_ss_64x64_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
> +void x265_blockcopy_ss_16x4_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
> +void x265_blockcopy_ss_16x8_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
> +void x265_blockcopy_ss_16x12_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
> +void x265_blockcopy_ss_16x16_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
> +void x265_blockcopy_ss_16x24_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
> +void x265_blockcopy_ss_16x32_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
> +void x265_blockcopy_ss_16x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
> +void x265_blockcopy_ss_64x16_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
> +void x265_blockcopy_ss_64x32_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
> +void x265_blockcopy_ss_64x48_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
> +void x265_blockcopy_ss_64x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
>
> void x265_blockcopy_pp_32x8_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
> void x265_blockcopy_pp_32x16_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
> diff -r dfe0803ae6be -r b4454aa1b6ab source/common/x86/dct8.asm
> --- a/source/common/x86/dct8.asm Wed Nov 26 16:56:00 2014 -0600
> +++ b/source/common/x86/dct8.asm Thu Nov 27 10:12:03 2014 +0900
> @@ -318,7 +318,7 @@
> cextern pw_ppppmmmm
>
> ;------------------------------------------------------
> -;void dct4(int16_t *src, int16_t *dst, intptr_t stride)
> +;void dct4(const int16_t* src, int16_t* dst, intptr_t srcStride)
> ;------------------------------------------------------
> INIT_XMM sse2
> cglobal dct4, 3, 4, 8
> @@ -475,7 +475,7 @@
> RET
>
> ;-------------------------------------------------------
> -;void idct4(int16_t *src, int16_t *dst, intptr_t stride)
> +;void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride)
> ;-------------------------------------------------------
> INIT_XMM sse2
> cglobal idct4, 3, 4, 7
> @@ -565,7 +565,7 @@
> RET
>
> ;------------------------------------------------------
> -;void dst4(int16_t *src, int16_t *dst, intptr_t stride)
> +;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride)
> ;------------------------------------------------------
> INIT_XMM ssse3
> %if ARCH_X86_64
> @@ -657,7 +657,7 @@
> RET
>
> ;-------------------------------------------------------
> -;void idst4(int16_t *src, int16_t *dst, intptr_t stride)
> +;void idst4(const int16_t* src, int16_t* dst, intptr_t dstStride)
> ;-------------------------------------------------------
> INIT_XMM sse2
> cglobal idst4, 3, 4, 7
> @@ -750,7 +750,7 @@
>
>
> ;-------------------------------------------------------
> -; void dct8(int16_t *src, int16_t *dst, intptr_t stride)
> +; void dct8(const int16_t* src, int16_t* dst, intptr_t srcStride)
> ;-------------------------------------------------------
> INIT_XMM sse4
> cglobal dct8, 3,6,7,0-16*mmsize
> @@ -974,7 +974,7 @@
> RET
>
> ;-------------------------------------------------------
> -; void idct8(int16_t *src, int16_t *dst, intptr_t stride)
> +; void idct8(const int16_t* src, int16_t* dst, intptr_t dstStride)
> ;-------------------------------------------------------
> INIT_XMM ssse3
>
> @@ -1164,7 +1164,7 @@
>
>
> ;-----------------------------------------------------------------------------
> -; void denoise_dct(int16_t *dct, uint32_t *sum, uint16_t *offset, int size)
> +; void denoise_dct(int16_t* dct, uint32_t* sum, uint16_t* offset, int size)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse4
> cglobal denoise_dct, 4, 4, 6
> @@ -2106,7 +2106,7 @@
> %endmacro
>
> ;-------------------------------------------------------
> -; void idct16(int16_t *src, int16_t *dst, intptr_t stride)
> +; void idct16(const int16_t* src, int16_t* dst, intptr_t dstStride)
> ;-------------------------------------------------------
> INIT_YMM avx2
> cglobal idct16, 3, 7, 16, 0-16*mmsize
> @@ -2385,7 +2385,7 @@
> %endmacro
>
> ;-------------------------------------------------------
> -; void idct32(int16_t *src, int16_t *dst, intptr_t stride)
> +; void idct32(const int16_t* src, int16_t* dst, intptr_t dstStride)
> ;-------------------------------------------------------
>
> ; TODO: Reduce PHADDD instruction by PADDD
> @@ -2684,7 +2684,7 @@
> RET
>
> ;-------------------------------------------------------
> -; void idct4(int16_t *src, int16_t *dst, intptr_t stride)
> +; void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride)
> ;-------------------------------------------------------
> INIT_YMM avx2
> cglobal idct4, 3, 4, 6
> diff -r dfe0803ae6be -r b4454aa1b6ab source/common/x86/dct8.h
> --- a/source/common/x86/dct8.h Wed Nov 26 16:56:00 2014 -0600
> +++ b/source/common/x86/dct8.h Thu Nov 27 10:12:03 2014 +0900
> @@ -23,21 +23,21 @@
>
> #ifndef X265_DCT8_H
> #define X265_DCT8_H
> -void x265_dct4_sse2(const int16_t* src, int16_t* dst, intptr_t stride);
> -void x265_dst4_ssse3(const int16_t* src, int16_t* dst, intptr_t stride);
> -void x265_dct8_sse4(const int16_t* src, int16_t* dst, intptr_t stride);
> -void x265_dct4_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
> -void x265_dct8_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
> -void x265_dct16_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
> -void x265_dct32_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
> -void x265_idct32_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
> +void x265_dct4_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride);
> +void x265_dst4_ssse3(const int16_t* src, int16_t* dst, intptr_t srcStride);
> +void x265_dct8_sse4(const int16_t* src, int16_t* dst, intptr_t srcStride);
> +void x265_dct4_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
> +void x265_dct8_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
> +void x265_dct16_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
> +void x265_dct32_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
>
> -void x265_idst4_sse2(const int16_t* src, int16_t* dst, intptr_t stride);
> -void x265_idct4_sse2(const int16_t* src, int16_t* dst, intptr_t stride);
> -void x265_idct4_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
> -void x265_idct8_ssse3(const int16_t* src, int16_t* dst, intptr_t stride);
> -void x265_idct8_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
> -void x265_idct16_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
> +void x265_idst4_sse2(const int16_t* src, int16_t* dst, intptr_t dstStride);
> +void x265_idct4_sse2(const int16_t* src, int16_t* dst, intptr_t dstStride);
> +void x265_idct4_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
> +void x265_idct8_ssse3(const int16_t* src, int16_t* dst, intptr_t dstStride);
> +void x265_idct8_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
> +void x265_idct16_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
> +void x265_idct32_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
>
> void x265_denoise_dct_sse4(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
> void x265_denoise_dct_avx2(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
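The renamed stride parameters also document the data flow: the forward transforms read a
strided 2D residual and emit packed coefficients (hence srcStride), while the inverse
transforms consume packed coefficients and write a strided 2D residual (hence dstStride).
A quick usage sketch, with made-up buffer names:

    /* forward: strided 2D residual -> packed coefficients */
    x265_dct4_sse2(residual, coeff, resiStride);

    /* inverse: packed coefficients -> strided 2D residual */
    x265_idct4_sse2(coeff, residual, resiStride);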
> diff -r dfe0803ae6be -r b4454aa1b6ab source/encoder/search.cpp
> --- a/source/encoder/search.cpp Wed Nov 26 16:56:00 2014 -0600
> +++ b/source/encoder/search.cpp Thu Nov 27 10:12:03 2014 +0900
> @@ -2211,8 +2211,8 @@
> if (bTryZero)
> {
> /* coincident blocks of the two reference pictures */
> - const pixel *ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
> - const pixel *ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
> + const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
> + const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
> intptr_t refStride = slice->m_mref[0][0].lumaStride;
>
> primitives.pixelavg_pp[partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
> diff -r dfe0803ae6be -r b4454aa1b6ab source/test/pixelharness.cpp
> --- a/source/test/pixelharness.cpp Wed Nov 26 16:56:00 2014 -0600
> +++ b/source/test/pixelharness.cpp Thu Nov 27 10:12:03 2014 +0900
> @@ -344,60 +344,7 @@
> return true;
> }
>
> -bool PixelHarness::check_copy16to16_shl_t(cpy16to16_shl_t ref, cpy16to16_shl_t opt)
> -{
> - ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
> - ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
> -
> - int j = 0;
> - intptr_t stride = STRIDE;
> - for (int i = 0; i < ITERS; i++)
> - {
> - int shift = (rand() % 7 + 1);
> -
> - int index = i % TEST_CASES;
> - checked(opt, opt_dest, short_test_buff[index] + j, stride, shift, (int)stride);
> - ref(ref_dest, short_test_buff[index] + j, stride, shift, (int)stride);
> -
> - if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
> - return false;
> -
> - reportfail();
> - j += INCR;
> - }
> -
> - return true;
> -}
> -
> -bool PixelHarness::check_cvt16to32_shr_t(cvt16to32_shr_t ref, cvt16to32_shr_t opt)
> -{
> - ALIGN_VAR_16(int32_t, ref_dest[64 * 64]);
> - ALIGN_VAR_16(int32_t, opt_dest[64 * 64]);
> -
> - memset(ref_dest, 0xCD, sizeof(ref_dest));
> - memset(opt_dest, 0xCD, sizeof(opt_dest));
> -
> - int j = 0;
> - intptr_t stride = STRIDE;
> - for (int i = 0; i < ITERS; i++)
> - {
> - int shift = (rand() % 7 + 1);
> -
> - int index = i % TEST_CASES;
> - checked(opt, opt_dest, short_test_buff[index] + j, stride, shift, (int)stride);
> - ref(ref_dest, short_test_buff[index] + j, stride, shift, (int)stride);
> -
> - if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int32_t)))
> - return false;
> -
> - reportfail();
> - j += INCR;
> - }
> -
> - return true;
> -}
> -
> -bool PixelHarness::check_cvt32to16_shl_t(cvt32to16_shl_t ref, cvt32to16_shl_t opt)
> +bool PixelHarness::check_cpy2Dto1D_shl_t(cpy2Dto1D_shl_t ref, cpy2Dto1D_shl_t opt)
> {
> ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
> ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
> @@ -412,8 +359,36 @@
> int shift = (rand() % 7 + 1);
>
> int index = i % TEST_CASES;
> - checked(opt, opt_dest, int_test_buff[index] + j, stride, shift);
> - ref(ref_dest, int_test_buff[index] + j, stride, shift);
> + checked(opt, opt_dest, short_test_buff[index] + j, stride, shift);
> + ref(ref_dest, short_test_buff[index] + j, stride, shift);
> +
> + if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
> + return false;
> +
> + reportfail();
> + j += INCR;
> + }
> +
> + return true;
> +}
> +
> +bool PixelHarness::check_cpy2Dto1D_shr_t(cpy2Dto1D_shr_t ref, cpy2Dto1D_shr_t opt)
> +{
> + ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
> + ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
> +
> + memset(ref_dest, 0xCD, sizeof(ref_dest));
> + memset(opt_dest, 0xCD, sizeof(opt_dest));
> +
> + int j = 0;
> + intptr_t stride = STRIDE;
> + for (int i = 0; i < ITERS; i++)
> + {
> + int shift = (rand() % 7 + 1);
> +
> + int index = i % TEST_CASES;
> + checked(opt, opt_dest, short_test_buff[index] + j, stride, shift);
> + ref(ref_dest, short_test_buff[index] + j, stride, shift);
>
> if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
> return false;
> @@ -451,7 +426,7 @@
> return true;
> }
>
> -bool PixelHarness::check_copy_shr_t(copy_shr_t ref, copy_shr_t opt)
> +bool PixelHarness::check_cpy1Dto2D_shl_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt)
> {
> ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
> ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
> @@ -466,8 +441,8 @@
> int shift = (rand() % 7 + 1);
>
> int index = i % TEST_CASES;
> - checked(opt, opt_dest, short_test_buff[index] + j, stride, shift, (int)STRIDE);
> - ref(ref_dest, short_test_buff[index] + j, stride, shift, (int)STRIDE);
> + checked(opt, opt_dest, short_test_buff[index] + j, stride, shift);
> + ref(ref_dest, short_test_buff[index] + j, stride, shift);
>
> if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
> return false;
> @@ -479,7 +454,7 @@
> return true;
> }
>
> -bool PixelHarness::check_copy_shl_t(copy_shl_t ref, copy_shl_t opt)
> +bool PixelHarness::check_cpy1Dto2D_shr_t(cpy1Dto2D_shr_t ref, cpy1Dto2D_shr_t opt)
> {
> ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
> ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
> @@ -1280,41 +1255,40 @@
> }
> }
>
> - if ((i < BLOCK_64x64) && opt.cvt16to32_shr[i])
> + if ((i < BLOCK_64x64) && opt.cpy2Dto1D_shl[i])
> {
> - if (!check_cvt16to32_shr_t(ref.cvt16to32_shr[i], opt.cvt16to32_shr[i]))
> + if (!check_cpy2Dto1D_shl_t(ref.cpy2Dto1D_shl[i], opt.cpy2Dto1D_shl[i]))
> {
> - printf("cvt16to32_shr failed!\n");
> + printf("cpy2Dto1D_shl failed!\n");
> return false;
> }
> }
>
> - if ((i < BLOCK_64x64) && opt.cvt32to16_shl[i])
> + if ((i < BLOCK_64x64) && opt.cpy2Dto1D_shr[i])
> {
> - if (!check_cvt32to16_shl_t(ref.cvt32to16_shl[i], opt.cvt32to16_shl[i]))
> + if (!check_cpy2Dto1D_shr_t(ref.cpy2Dto1D_shr[i], opt.cpy2Dto1D_shr[i]))
> {
> - printf("cvt32to16_shl failed!\n");
> + printf("cpy2Dto1D_shr failed!\n");
> return false;
> }
> }
>
> - if ((i < BLOCK_64x64) && opt.copy_shl[i])
> + if ((i < BLOCK_64x64) && opt.cpy1Dto2D_shl[i])
> {
> - if (!check_copy_shl_t(ref.copy_shl[i], opt.copy_shl[i]))
> + if (!check_cpy1Dto2D_shl_t(ref.cpy1Dto2D_shl[i], opt.cpy1Dto2D_shl[i]))
> {
> - printf("copy_shl[%dx%d] failed!\n", 4 << i, 4 << i);
> + printf("cpy1Dto2D_shl[%dx%d] failed!\n", 4 << i, 4 << i);
> return false;
> }
> }
>
> - }
> -
> - if (opt.cpy16to16_shl)
> - {
> - if (!check_copy16to16_shl_t(ref.cpy16to16_shl, opt.cpy16to16_shl))
> + if ((i < BLOCK_64x64) && opt.cpy1Dto2D_shr[i])
> {
> - printf("copy16to16_shl failed!\n");
> - return false;
> + if (!check_cpy1Dto2D_shr_t(ref.cpy1Dto2D_shr[i], opt.cpy1Dto2D_shr[i]))
> + {
> + printf("cpy1Dto2D_shr[%dx%d] failed!\n", 4 << i, 4 << i);
> + return false;
> + }
> }
> }
>
> @@ -1408,15 +1382,6 @@
> }
> }
>
> - if (opt.copy_shr)
> - {
> - if (!check_copy_shr_t(ref.copy_shr, opt.copy_shr))
> - {
> - printf("copy_shr failed!\n");
> - return false;
> - }
> - }
> -
> return true;
> }
>
> @@ -1637,16 +1602,28 @@
> REPORT_SPEEDUP(opt.var[i], ref.var[i], pbuf1, STRIDE);
> }
>
> - if ((i < BLOCK_64x64) && opt.cvt16to32_shr[i])
> + if ((i < BLOCK_64x64) && opt.cpy2Dto1D_shl[i])
> {
> - HEADER("cvt16to32_shr[%dx%d]", 4 << i, 4 << i);
> - REPORT_SPEEDUP(opt.cvt16to32_shr[i], ref.cvt16to32_shr[i], ibuf1, sbuf2, STRIDE, 3, 4);
> + HEADER("cpy2Dto1D_shl[%dx%d]", 4 << i, 4 << i);
> + REPORT_SPEEDUP(opt.cpy2Dto1D_shl[i], ref.cpy2Dto1D_shl[i], sbuf1, sbuf2, STRIDE, MAX_TR_DYNAMIC_RANGE - X265_DEPTH - (i + 2));
> }
>
> - if ((i < BLOCK_64x64) && opt.cvt32to16_shl[i])
> + if ((i < BLOCK_64x64) && opt.cpy2Dto1D_shr[i])
> {
> - HEADER("cvt32to16_shl[%dx%d]", 4 << i, 4 << i);
> - REPORT_SPEEDUP(opt.cvt32to16_shl[i], ref.cvt32to16_shl[i], sbuf2, ibuf1, STRIDE, 3);
> + HEADER("cpy2Dto1D_shr[%dx%d]", 4 << i, 4 << i);
> + REPORT_SPEEDUP(opt.cpy2Dto1D_shr[i], ref.cpy2Dto1D_shr[i], sbuf1, sbuf2, STRIDE, 3);
> + }
> +
> + if ((i < BLOCK_64x64) && opt.cpy1Dto2D_shl[i])
> + {
> + HEADER("cpy1Dto2D_shl[%dx%d]", 4 << i, 4 << i);
> + REPORT_SPEEDUP(opt.cpy1Dto2D_shl[i], ref.cpy1Dto2D_shl[i], sbuf1, sbuf2, STRIDE, 64);
> + }
> +
> + if ((i < BLOCK_64x64) && opt.cpy1Dto2D_shr[i])
> + {
> + HEADER("cpy1Dto2D_shr[%dx%d]", 4 << i, 4 << i);
> + REPORT_SPEEDUP(opt.cpy1Dto2D_shr[i], ref.cpy1Dto2D_shr[i], sbuf1, sbuf2, STRIDE, 64);
> }
>
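Small sanity check on the cpy2Dto1D_shl shift argument above (assuming I read the
constants right): with MAX_TR_DYNAMIC_RANGE = 15 and an 8-bit build,
MAX_TR_DYNAMIC_RANGE - X265_DEPTH - (i + 2) works out to 15 - 8 - 2 = 5 for the 4x4 case,
which matches the residual scaling shift the transform-skip path uses, so the benchmark
exercises a realistic shift value.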
> if ((i < BLOCK_64x64) && opt.copy_cnt[i])
> @@ -1654,19 +1631,6 @@
> HEADER("copy_cnt[%dx%d]", 4 << i, 4 << i);
> REPORT_SPEEDUP(opt.copy_cnt[i], ref.copy_cnt[i], sbuf1, sbuf2, STRIDE);
> }
> -
> - if ((i < BLOCK_64x64) && opt.copy_shl[i])
> - {
> - HEADER("copy_shl[%dx%d]", 4 << i, 4 << i);
> - REPORT_SPEEDUP(opt.copy_shl[i], ref.copy_shl[i], sbuf1, sbuf2, STRIDE, 64);
> - }
> -
> - }
> -
> - if (opt.cpy16to16_shl)
> - {
> - HEADER0("cpy16to16_shl");
> - REPORT_SPEEDUP(opt.cpy16to16_shl, ref.cpy16to16_shl, sbuf2, sbuf1, 64, 5, 64);
> }
>
> if (opt.weight_pp)
> @@ -1728,11 +1692,4 @@
> HEADER0("planecopy_cp");
> REPORT_SPEEDUP(opt.planecopy_cp, ref.planecopy_cp, uchar_test_buff[0], 64, pbuf1, 64, 64, 64, 2);
> }
> -
> - if (opt.copy_shr)
> - {
> - HEADER0("copy_shr");
> - REPORT_SPEEDUP(opt.copy_shr, ref.copy_shr, sbuf1, sbuf2, 64, 5, 64);
> - }
> -
> }
> diff -r dfe0803ae6be -r b4454aa1b6ab source/test/pixelharness.h
> --- a/source/test/pixelharness.h Wed Nov 26 16:56:00 2014 -0600
> +++ b/source/test/pixelharness.h Thu Nov 27 10:12:03 2014 +0900
> @@ -80,12 +80,11 @@
> bool check_weightp(weightp_pp_t ref, weightp_pp_t opt);
> bool check_weightp(weightp_sp_t ref, weightp_sp_t opt);
> bool check_downscale_t(downscale_t ref, downscale_t opt);
> - bool check_copy16to16_shl_t(cpy16to16_shl_t ref, cpy16to16_shl_t opt);
> - bool check_cvt16to32_shr_t(cvt16to32_shr_t ref, cvt16to32_shr_t opt);
> - bool check_cvt32to16_shl_t(cvt32to16_shl_t ref, cvt32to16_shl_t opt);
> + bool check_cpy2Dto1D_shl_t(cpy2Dto1D_shl_t ref, cpy2Dto1D_shl_t opt);
> + bool check_cpy2Dto1D_shr_t(cpy2Dto1D_shr_t ref, cpy2Dto1D_shr_t opt);
> + bool check_cpy1Dto2D_shl_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt);
> + bool check_cpy1Dto2D_shr_t(cpy1Dto2D_shr_t ref, cpy1Dto2D_shr_t opt);
> bool check_copy_cnt_t(copy_cnt_t ref, copy_cnt_t opt);
> - bool check_copy_shr_t(copy_shr_t ref, copy_shr_t opt);
> - bool check_copy_shl_t(copy_shl_t ref, copy_shl_t opt);
> bool check_pixel_var(var_t ref, var_t opt);
> bool check_ssim_4x4x2_core(ssim_4x4x2_core_t ref, ssim_4x4x2_core_t opt);
> bool check_ssim_end(ssim_end4_t ref, ssim_end4_t opt);
--
Steve Borho