[x265] [PATCH] refactorization of the transform/quant path
Steve Borho
steve at borho.org
Tue Nov 18 18:55:15 CET 2014
On 11/18, praveen at multicorewareinc.com wrote:
> # HG changeset patch
> # User Praveen Tiwari
> # Date 1416299427 -19800
> # Node ID 706fa4af912bc1610478de8f09a651ae3e58624c
> # Parent 2f0062f0791b822fa932712a56e6b0a14e976d91
> refactorization of the transform/quant path.
Queued with white-space reflowing of the commit message
> This patch scales the DCT/IDCT coefficients down from int32_t to int16_t,
> since they can be accommodated in int16_t without introducing any encode error.
> This lets us clean up many intermediate DCT/IDCT buffers, improve encode efficiency for
> various CLI options (including noise reduction) by reducing data movement, and pack more
> coefficients into a single register for SIMD operations. The patch includes all necessary
> changes for the transform/quant path, including the unit test code.
>
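For reference, the range argument is visible in the patch itself: the dequant kernels
already clip every coefficient to [-32768, 32767], so storing the results as int16_t loses
nothing, while a 128-bit register now holds eight coefficients instead of four. A minimal
C sketch of that invariant (my illustration only, not code from the patch; clip3() and
dequant_normal_sketch() are stand-in names for the Clip3 template and dequant_normal_c
in dct.cpp):

    #include <stdint.h>

    static inline int clip3(int lo, int hi, int v)
    {
        return v < lo ? lo : (v > hi ? hi : v);
    }

    /* After the patch, dequantized coefficients are produced directly as int16_t.
     * The clip guarantees each value fits, so no precision is lost relative to
     * the old int32_t coefficient buffer. */
    static void dequant_normal_sketch(const int16_t *quantCoef, int16_t *coef,
                                      int num, int scale, int shift)
    {
        int add = 1 << (shift - 1);
        for (int n = 0; n < num; n++)
        {
            int coeffQ = (quantCoef[n] * scale + add) >> shift;
            coef[n] = (int16_t)clip3(-32768, 32767, coeffQ);
        }
    }

The same reasoning is what allows the SSE4/AVX2 paths below to drop the pmovsxwd/packssdw
widening and narrowing steps and operate on packed words directly.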
> diff -r 2f0062f0791b -r 706fa4af912b source/common/dct.cpp
> --- a/source/common/dct.cpp Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/common/dct.cpp Tue Nov 18 14:00:27 2014 +0530
> @@ -440,7 +440,7 @@
> }
> }
>
> -void dst4_c(int16_t *src, int32_t *dst, intptr_t stride)
> +void dst4_c(int16_t *src, int16_t *dst, intptr_t stride)
> {
> const int shift_1st = 1 + X265_DEPTH - 8;
> const int shift_2nd = 8;
> @@ -454,132 +454,58 @@
> }
>
> fastForwardDst(block, coef, shift_1st);
> - fastForwardDst(coef, block, shift_2nd);
> -
> -#define N (4)
> - for (int i = 0; i < N; i++)
> - {
> - for (int j = 0; j < N; j++)
> - {
> - dst[i * N + j] = block[i * N + j];
> - }
> - }
> -
> -#undef N
> + fastForwardDst(coef, dst, shift_2nd);
> }
>
> -void dct4_c(int16_t *src, int32_t *dst, intptr_t stride)
> +void dct4_c(int16_t *src, int16_t *dst, intptr_t stride)
> {
> + stride; // To eliminate warnings and match the interface with asm code.
> const int shift_1st = 1 + X265_DEPTH - 8;
> const int shift_2nd = 8;
>
> ALIGN_VAR_32(int16_t, coef[4 * 4]);
> - ALIGN_VAR_32(int16_t, block[4 * 4]);
>
> - for (int i = 0; i < 4; i++)
> - {
> - memcpy(&block[i * 4], &src[i * stride], 4 * sizeof(int16_t));
> - }
> -
> - partialButterfly4(block, coef, shift_1st, 4);
> - partialButterfly4(coef, block, shift_2nd, 4);
> -#define N (4)
> - for (int i = 0; i < N; i++)
> - {
> - for (int j = 0; j < N; j++)
> - {
> - dst[i * N + j] = block[i * N + j];
> - }
> - }
> -
> -#undef N
> + partialButterfly4(src, coef, shift_1st, 4);
> + partialButterfly4(coef, dst, shift_2nd, 4);
> }
>
> -void dct8_c(int16_t *src, int32_t *dst, intptr_t stride)
> +void dct8_c(int16_t *src, int16_t *dst, intptr_t stride)
> {
> + stride; // To eliminate warnings and match the interface with asm code.
> const int shift_1st = 2 + X265_DEPTH - 8;
> const int shift_2nd = 9;
>
> ALIGN_VAR_32(int16_t, coef[8 * 8]);
> - ALIGN_VAR_32(int16_t, block[8 * 8]);
>
> - for (int i = 0; i < 8; i++)
> - {
> - memcpy(&block[i * 8], &src[i * stride], 8 * sizeof(int16_t));
> - }
> -
> - partialButterfly8(block, coef, shift_1st, 8);
> - partialButterfly8(coef, block, shift_2nd, 8);
> -
> -#define N (8)
> - for (int i = 0; i < N; i++)
> - {
> - for (int j = 0; j < N; j++)
> - {
> - dst[i * N + j] = block[i * N + j];
> - }
> - }
> -
> -#undef N
> + partialButterfly8(src, coef, shift_1st, 8);
> + partialButterfly8(coef, dst, shift_2nd, 8);
> }
>
> -void dct16_c(int16_t *src, int32_t *dst, intptr_t stride)
> +void dct16_c(int16_t *src, int16_t *dst, intptr_t stride)
> {
> + stride; // To eliminate warnings and match the interface with asm code.
> const int shift_1st = 3 + X265_DEPTH - 8;
> const int shift_2nd = 10;
>
> ALIGN_VAR_32(int16_t, coef[16 * 16]);
> - ALIGN_VAR_32(int16_t, block[16 * 16]);
>
> - for (int i = 0; i < 16; i++)
> - {
> - memcpy(&block[i * 16], &src[i * stride], 16 * sizeof(int16_t));
> - }
> -
> - partialButterfly16(block, coef, shift_1st, 16);
> - partialButterfly16(coef, block, shift_2nd, 16);
> -
> -#define N (16)
> - for (int i = 0; i < N; i++)
> - {
> - for (int j = 0; j < N; j++)
> - {
> - dst[i * N + j] = block[i * N + j];
> - }
> - }
> -
> -#undef N
> + partialButterfly16(src, coef, shift_1st, 16);
> + partialButterfly16(coef, dst, shift_2nd, 16);
> }
>
> -void dct32_c(int16_t *src, int32_t *dst, intptr_t stride)
> +void dct32_c(int16_t *src, int16_t *dst, intptr_t stride)
> {
> + stride; // To eliminate warnings and match the interface with asm code.
> const int shift_1st = 4 + X265_DEPTH - 8;
> const int shift_2nd = 11;
>
> ALIGN_VAR_32(int16_t, coef[32 * 32]);
> - ALIGN_VAR_32(int16_t, block[32 * 32]);
>
> - for (int i = 0; i < 32; i++)
> - {
> - memcpy(&block[i * 32], &src[i * stride], 32 * sizeof(int16_t));
> - }
> -
> - partialButterfly32(block, coef, shift_1st, 32);
> - partialButterfly32(coef, block, shift_2nd, 32);
> -
> -#define N (32)
> - for (int i = 0; i < N; i++)
> - {
> - for (int j = 0; j < N; j++)
> - {
> - dst[i * N + j] = block[i * N + j];
> - }
> - }
> -
> -#undef N
> + partialButterfly32(src, coef, shift_1st, 32);
> + partialButterfly32(coef, dst, shift_2nd, 32);
> }
>
> -void idst4_c(int32_t *src, int16_t *dst, intptr_t stride)
> +void idst4_c(int16_t *src, int16_t *dst, intptr_t stride)
> {
> const int shift_1st = 7;
> const int shift_2nd = 12 - (X265_DEPTH - 8);
> @@ -587,18 +513,7 @@
> ALIGN_VAR_32(int16_t, coef[4 * 4]);
> ALIGN_VAR_32(int16_t, block[4 * 4]);
>
> -#define N (4)
> - for (int i = 0; i < N; i++)
> - {
> - for (int j = 0; j < N; j++)
> - {
> - block[i * N + j] = (int16_t)src[i * N + j];
> - }
> - }
> -
> -#undef N
> -
> - inversedst(block, coef, shift_1st); // Forward DST BY FAST ALGORITHM, block input, coef output
> + inversedst(src, coef, shift_1st); // Forward DST BY FAST ALGORITHM, block input, coef output
> inversedst(coef, block, shift_2nd); // Forward DST BY FAST ALGORITHM, coef input, coeff output
>
> for (int i = 0; i < 4; i++)
> @@ -607,7 +522,7 @@
> }
> }
>
> -void idct4_c(int32_t *src, int16_t *dst, intptr_t stride)
> +void idct4_c(int16_t *src, int16_t *dst, intptr_t stride)
> {
> const int shift_1st = 7;
> const int shift_2nd = 12 - (X265_DEPTH - 8);
> @@ -615,18 +530,7 @@
> ALIGN_VAR_32(int16_t, coef[4 * 4]);
> ALIGN_VAR_32(int16_t, block[4 * 4]);
>
> -#define N (4)
> - for (int i = 0; i < N; i++)
> - {
> - for (int j = 0; j < N; j++)
> - {
> - block[i * N + j] = (int16_t)src[i * N + j];
> - }
> - }
> -
> -#undef N
> -
> - partialButterflyInverse4(block, coef, shift_1st, 4); // Forward DST BY FAST ALGORITHM, block input, coef output
> + partialButterflyInverse4(src, coef, shift_1st, 4); // Forward DST BY FAST ALGORITHM, block input, coef output
> partialButterflyInverse4(coef, block, shift_2nd, 4); // Forward DST BY FAST ALGORITHM, coef input, coeff output
>
> for (int i = 0; i < 4; i++)
> @@ -635,7 +539,7 @@
> }
> }
>
> -void idct8_c(int32_t *src, int16_t *dst, intptr_t stride)
> +void idct8_c(int16_t *src, int16_t *dst, intptr_t stride)
> {
> const int shift_1st = 7;
> const int shift_2nd = 12 - (X265_DEPTH - 8);
> @@ -643,18 +547,7 @@
> ALIGN_VAR_32(int16_t, coef[8 * 8]);
> ALIGN_VAR_32(int16_t, block[8 * 8]);
>
> -#define N (8)
> - for (int i = 0; i < N; i++)
> - {
> - for (int j = 0; j < N; j++)
> - {
> - block[i * N + j] = (int16_t)src[i * N + j];
> - }
> - }
> -
> -#undef N
> -
> - partialButterflyInverse8(block, coef, shift_1st, 8);
> + partialButterflyInverse8(src, coef, shift_1st, 8);
> partialButterflyInverse8(coef, block, shift_2nd, 8);
> for (int i = 0; i < 8; i++)
> {
> @@ -662,7 +555,7 @@
> }
> }
>
> -void idct16_c(int32_t *src, int16_t *dst, intptr_t stride)
> +void idct16_c(int16_t *src, int16_t *dst, intptr_t stride)
> {
> const int shift_1st = 7;
> const int shift_2nd = 12 - (X265_DEPTH - 8);
> @@ -670,18 +563,7 @@
> ALIGN_VAR_32(int16_t, coef[16 * 16]);
> ALIGN_VAR_32(int16_t, block[16 * 16]);
>
> -#define N (16)
> - for (int i = 0; i < N; i++)
> - {
> - for (int j = 0; j < N; j++)
> - {
> - block[i * N + j] = (int16_t)src[i * N + j];
> - }
> - }
> -
> -#undef N
> -
> - partialButterflyInverse16(block, coef, shift_1st, 16);
> + partialButterflyInverse16(src, coef, shift_1st, 16);
> partialButterflyInverse16(coef, block, shift_2nd, 16);
> for (int i = 0; i < 16; i++)
> {
> @@ -689,7 +571,7 @@
> }
> }
>
> -void idct32_c(int32_t *src, int16_t *dst, intptr_t stride)
> +void idct32_c(int16_t *src, int16_t *dst, intptr_t stride)
> {
> const int shift_1st = 7;
> const int shift_2nd = 12 - (X265_DEPTH - 8);
> @@ -697,18 +579,7 @@
> ALIGN_VAR_32(int16_t, coef[32 * 32]);
> ALIGN_VAR_32(int16_t, block[32 * 32]);
>
> -#define N (32)
> - for (int i = 0; i < N; i++)
> - {
> - for (int j = 0; j < N; j++)
> - {
> - block[i * N + j] = (int16_t)src[i * N + j];
> - }
> - }
> -
> -#undef N
> -
> - partialButterflyInverse32(block, coef, shift_1st, 32);
> + partialButterflyInverse32(src, coef, shift_1st, 32);
> partialButterflyInverse32(coef, block, shift_2nd, 32);
>
> for (int i = 0; i < 32; i++)
> @@ -717,7 +588,7 @@
> }
> }
>
> -void dequant_normal_c(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift)
> +void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
> {
> #if HIGH_BIT_DEPTH
> X265_CHECK(scale < 32768 || ((scale & 3) == 0 && shift > 2), "dequant invalid scale %d\n", scale);
> @@ -737,11 +608,11 @@
> for (int n = 0; n < num; n++)
> {
> coeffQ = (quantCoef[n] * scale + add) >> shift;
> - coef[n] = Clip3(-32768, 32767, coeffQ);
> + coef[n] = (int16_t)Clip3(-32768, 32767, coeffQ);
> }
> }
>
> -void dequant_scaling_c(const int16_t* quantCoef, const int32_t *deQuantCoef, int32_t* coef, int num, int per, int shift)
> +void dequant_scaling_c(const int16_t* quantCoef, const int32_t *deQuantCoef, int16_t* coef, int num, int per, int shift)
> {
> X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);
>
> @@ -756,7 +627,7 @@
> for (int n = 0; n < num; n++)
> {
> coeffQ = ((quantCoef[n] * deQuantCoef[n]) + add) >> (shift - per);
> - coef[n] = Clip3(-32768, 32767, coeffQ);
> + coef[n] = (int16_t)Clip3(-32768, 32767, coeffQ);
> }
> }
> else
> @@ -764,12 +635,12 @@
> for (int n = 0; n < num; n++)
> {
> coeffQ = Clip3(-32768, 32767, quantCoef[n] * deQuantCoef[n]);
> - coef[n] = Clip3(-32768, 32767, coeffQ << (per - shift));
> + coef[n] = (int16_t)Clip3(-32768, 32767, coeffQ << (per - shift));
> }
> }
> }
>
> -uint32_t quant_c(int32_t* coef, int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
> +uint32_t quant_c(int16_t* coef, int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
> {
> X265_CHECK(qBits >= 8, "qBits less than 8\n");
> X265_CHECK((numCoeff % 16) == 0, "numCoeff must be multiple of 16\n");
> @@ -793,7 +664,7 @@
> return numSig;
> }
>
> -uint32_t nquant_c(int32_t* coef, int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff)
> +uint32_t nquant_c(int16_t* coef, int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff)
> {
> X265_CHECK((numCoeff % 16) == 0, "number of quant coeff is not multiple of 4x4\n");
> X265_CHECK((uint32_t)add < ((uint32_t)1 << qBits), "2 ^ qBits less than add\n");
> @@ -848,7 +719,7 @@
> return numSig;
> }
>
> -void denoiseDct_c(int32_t* dctCoef, uint32_t* resSum, uint16_t* offset, int numCoeff)
> +void denoiseDct_c(int16_t* dctCoef, uint32_t* resSum, uint16_t* offset, int numCoeff)
> {
> for (int i = 0; i < numCoeff; i++)
> {
> @@ -857,7 +728,7 @@
> level = (level + sign) ^ sign;
> resSum[i] += level;
> level -= offset[i];
> - dctCoef[i] = level < 0 ? 0 : (level ^ sign) - sign;
> + dctCoef[i] = (int16_t)(level < 0 ? 0 : (level ^ sign) - sign);
> }
> }
>
> diff -r 2f0062f0791b -r 706fa4af912b source/common/pixel.cpp
> --- a/source/common/pixel.cpp Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/common/pixel.cpp Tue Nov 18 14:00:27 2014 +0530
> @@ -491,13 +491,13 @@
> }
> }
>
> -void convert16to32_shl(int32_t *dst, int16_t *src, intptr_t stride, int shift, int size)
> +void copy16to16_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size)
> {
> for (int i = 0; i < size; i++)
> {
> for (int j = 0; j < size; j++)
> {
> - dst[i * size + j] = ((int)src[i * stride + j]) << shift;
> + dst[i * size + j] = (src[i * stride + j]) << shift;
> }
> }
> }
> @@ -514,22 +514,6 @@
> }
> }
>
> -void convert32to16_shr(int16_t *dst, int32_t *src, intptr_t stride, int shift, int size)
> -{
> - int round = 1 << (shift - 1);
> -
> - for (int i = 0; i < size; i++)
> - {
> - for (int j = 0; j < size; j++)
> - {
> - dst[j] = (int16_t)((src[j] + round) >> shift);
> - }
> -
> - src += size;
> - dst += stride;
> - }
> -}
> -
> void copy_shr(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size)
> {
> int round = 1 << (shift - 1);
> @@ -1288,12 +1272,11 @@
> p.blockfill_s[BLOCK_32x32] = blockfil_s_c<32>;
> p.blockfill_s[BLOCK_64x64] = blockfil_s_c<64>;
>
> - p.cvt16to32_shl = convert16to32_shl;
> + p.cpy16to16_shl = copy16to16_shl;
> p.cvt16to32_shr[BLOCK_4x4] = convert16to32_shr<4>;
> p.cvt16to32_shr[BLOCK_8x8] = convert16to32_shr<8>;
> p.cvt16to32_shr[BLOCK_16x16] = convert16to32_shr<16>;
> p.cvt16to32_shr[BLOCK_32x32] = convert16to32_shr<32>;
> - p.cvt32to16_shr = convert32to16_shr;
> p.cvt32to16_shl[BLOCK_4x4] = convert32to16_shl<4>;
> p.cvt32to16_shl[BLOCK_8x8] = convert32to16_shl<8>;
> p.cvt32to16_shl[BLOCK_16x16] = convert32to16_shl<16>;
> diff -r 2f0062f0791b -r 706fa4af912b source/common/primitives.h
> --- a/source/common/primitives.h Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/common/primitives.h Tue Nov 18 14:00:27 2014 +0530
> @@ -147,24 +147,23 @@
> typedef void (*intra_pred_t)(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter);
> typedef void (*intra_allangs_t)(pixel *dst, pixel *above0, pixel *left0, pixel *above1, pixel *left1, int bLuma);
>
> -typedef void (*cvt16to32_shl_t)(int32_t *dst, int16_t *src, intptr_t, int, int);
> +typedef void (*cpy16to16_shl_t)(int16_t *dst, int16_t *src, intptr_t, int, int);
> typedef void (*cvt16to32_shr_t)(int32_t *dst, int16_t *src, intptr_t, int, int);
> -typedef void (*cvt32to16_shr_t)(int16_t *dst, int32_t *src, intptr_t, int, int);
> typedef void (*cvt32to16_shl_t)(int16_t *dst, int32_t *src, intptr_t, int);
> typedef uint32_t (*copy_cnt_t)(int16_t* coeff, int16_t* residual, intptr_t stride);
> typedef void (*copy_shr_t)(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size);
> typedef void (*copy_shl_t)(int16_t *dst, int16_t *src, intptr_t stride, int shift);
>
> -typedef void (*dct_t)(int16_t *src, int32_t *dst, intptr_t stride);
> -typedef void (*idct_t)(int32_t *src, int16_t *dst, intptr_t stride);
> -typedef void (*denoiseDct_t)(int32_t* dctCoef, uint32_t* resSum, uint16_t* offset, int numCoeff);
> +typedef void (*dct_t)(int16_t *src, int16_t *dst, intptr_t stride);
> +typedef void (*idct_t)(int16_t *src, int16_t *dst, intptr_t stride);
> +typedef void (*denoiseDct_t)(int16_t* dctCoef, uint32_t* resSum, uint16_t* offset, int numCoeff);
>
> typedef void (*calcresidual_t)(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
> typedef void (*transpose_t)(pixel* dst, pixel* src, intptr_t stride);
> -typedef uint32_t (*quant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
> -typedef uint32_t (*nquant_t)(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
> -typedef void (*dequant_scaling_t)(const int16_t* src, const int32_t *dequantCoef, int32_t* dst, int num, int mcqp_miper, int shift);
> -typedef void (*dequant_normal_t)(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift);
> +typedef uint32_t (*quant_t)(int16_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
> +typedef uint32_t (*nquant_t)(int16_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
> +typedef void (*dequant_scaling_t)(const int16_t* src, const int32_t *dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift);
> +typedef void (*dequant_normal_t)(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
> typedef int (*count_nonzero_t)(const int16_t *quantCoeff, int numCoeff);
>
> typedef void (*weightp_pp_t)(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
> @@ -220,9 +219,8 @@
> pixelcmp_ss_t psy_cost_ss[NUM_SQUARE_BLOCKS];
>
> blockfill_s_t blockfill_s[NUM_SQUARE_BLOCKS]; // block fill with value
> - cvt16to32_shl_t cvt16to32_shl;
> + cpy16to16_shl_t cpy16to16_shl;
> cvt16to32_shr_t cvt16to32_shr[NUM_SQUARE_BLOCKS - 1];
> - cvt32to16_shr_t cvt32to16_shr;
> cvt32to16_shl_t cvt32to16_shl[NUM_SQUARE_BLOCKS - 1];
> copy_cnt_t copy_cnt[NUM_SQUARE_BLOCKS - 1];
> copy_shr_t copy_shr;
> diff -r 2f0062f0791b -r 706fa4af912b source/common/quant.cpp
> --- a/source/common/quant.cpp Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/common/quant.cpp Tue Nov 18 14:00:27 2014 +0530
> @@ -166,7 +166,7 @@
> m_useRDOQ = useRDOQ;
> m_psyRdoqScale = (int64_t)(psyScale * 256.0);
> m_scalingList = &scalingList;
> - m_resiDctCoeff = X265_MALLOC(int32_t, MAX_TR_SIZE * MAX_TR_SIZE * 2);
> + m_resiDctCoeff = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE * 2);
> m_fencDctCoeff = m_resiDctCoeff + (MAX_TR_SIZE * MAX_TR_SIZE);
> m_fencShortBuf = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE);
>
> @@ -340,7 +340,7 @@
> if (useTransformSkip)
> {
> #if X265_DEPTH <= 10
> - primitives.cvt16to32_shl(m_resiDctCoeff, residual, stride, transformShift, trSize);
> + primitives.cpy16to16_shl(m_resiDctCoeff, residual, stride, transformShift, trSize);
> #else
> if (transformShift >= 0)
> primitives.cvt16to32_shl(m_resiDctCoeff, residual, stride, transformShift, trSize);
> @@ -441,10 +441,10 @@
> int trSize = 1 << log2TrSize;
>
> #if X265_DEPTH <= 10
> - primitives.cvt32to16_shr(residual, m_resiDctCoeff, stride, transformShift, trSize);
> + primitives.copy_shr(residual, m_resiDctCoeff, stride, transformShift, trSize);
> #else
> if (transformShift > 0)
> - primitives.cvt32to16_shr(residual, m_resiDctCoeff, stride, transformShift, trSize);
> + primitives.copy_shr(residual, m_resiDctCoeff, stride, transformShift, trSize);
> else
> primitives.cvt32to16_shl[log2TrSize - 2](residual, m_resiDctCoeff, stride, -transformShift);
> #endif
> diff -r 2f0062f0791b -r 706fa4af912b source/common/quant.h
> --- a/source/common/quant.h Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/common/quant.h Tue Nov 18 14:00:27 2014 +0530
> @@ -83,8 +83,8 @@
>
> bool m_useRDOQ;
> int64_t m_psyRdoqScale;
> - int32_t* m_resiDctCoeff;
> - int32_t* m_fencDctCoeff;
> + int16_t* m_resiDctCoeff;
> + int16_t* m_fencDctCoeff;
> int16_t* m_fencShortBuf;
>
> enum { IEP_RATE = 32768 }; /* FIX15 cost of an equal probable bit */
> diff -r 2f0062f0791b -r 706fa4af912b source/common/vec/dct-sse3.cpp
> --- a/source/common/vec/dct-sse3.cpp Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/common/vec/dct-sse3.cpp Tue Nov 18 14:00:27 2014 +0530
> @@ -52,7 +52,7 @@
> { 83, 36, 83, 36, 83, 36, 83, 36 },
> { 36, -83, 36, -83, 36, -83, 36, -83 }
> };
> -void idct8(int32_t *src, int16_t *dst, intptr_t stride)
> +void idct8(int16_t *src, int16_t *dst, intptr_t stride)
> {
> __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h, E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l, O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h;
> __m128i T00, T01, T02, T03, T04, T05, T06, T07;
> @@ -305,7 +305,7 @@
> _mm_storeh_pi((__m64*)&dst[7 * stride + 4], _mm_castsi128_ps(T11));
> }
>
> -void idct16(int32_t *src, int16_t *dst, intptr_t stride)
> +void idct16(int16_t *src, int16_t *dst, intptr_t stride)
> {
> const __m128i c16_p87_p90 = _mm_set1_epi32(0x0057005A); //row0 87high - 90low address
> const __m128i c16_p70_p80 = _mm_set1_epi32(0x00460050);
> @@ -367,71 +367,22 @@
> for (int i = 0; i < 2; i++)
> {
> const int offset = (i << 3);
> - __m128i T00, T01;
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[0 * 16 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[0 * 16 + offset + 4]);
> - in00[i] = _mm_packs_epi32(T00, T01); // [07 06 05 04 03 02 01 00]
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[1 * 16 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[1 * 16 + offset + 4]);
> - in01[i] = _mm_packs_epi32(T00, T01); // [17 16 15 14 13 12 11 10]
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[2 * 16 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[2 * 16 + offset + 4]);
> - in02[i] = _mm_packs_epi32(T00, T01); // [27 26 25 24 23 22 21 20]
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[3 * 16 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[3 * 16 + offset + 4]);
> - in03[i] = _mm_packs_epi32(T00, T01); // [37 36 35 34 33 32 31 30]
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[4 * 16 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[4 * 16 + offset + 4]);
> - in04[i] = _mm_packs_epi32(T00, T01); // [47 46 45 44 43 42 41 40]
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[5 * 16 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[5 * 16 + offset + 4]);
> - in05[i] = _mm_packs_epi32(T00, T01); // [57 56 55 54 53 52 51 50]
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[6 * 16 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[6 * 16 + offset + 4]);
> - in06[i] = _mm_packs_epi32(T00, T01); // [67 66 65 64 63 62 61 60]
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[7 * 16 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[7 * 16 + offset + 4]);
> - in07[i] = _mm_packs_epi32(T00, T01); // [77 76 75 74 73 72 71 70]
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[8 * 16 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[8 * 16 + offset + 4]);
> - in08[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[9 * 16 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[9 * 16 + offset + 4]);
> - in09[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[10 * 16 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[10 * 16 + offset + 4]);
> - in10[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[11 * 16 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[11 * 16 + offset + 4]);
> - in11[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[12 * 16 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[12 * 16 + offset + 4]);
> - in12[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[13 * 16 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[13 * 16 + offset + 4]);
> - in13[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[14 * 16 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[14 * 16 + offset + 4]);
> - in14[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[15 * 16 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[15 * 16 + offset + 4]);
> - in15[i] = _mm_packs_epi32(T00, T01);
> + in00[i] = _mm_loadu_si128((const __m128i*)&src[0 * 16 + offset]); // [07 06 05 04 03 02 01 00]
> + in01[i] = _mm_loadu_si128((const __m128i*)&src[1 * 16 + offset]); // [17 16 15 14 13 12 11 10]
> + in02[i] = _mm_loadu_si128((const __m128i*)&src[2 * 16 + offset]); // [27 26 25 24 23 22 21 20]
> + in03[i] = _mm_loadu_si128((const __m128i*)&src[3 * 16 + offset]); // [37 36 35 34 33 32 31 30]
> + in04[i] = _mm_loadu_si128((const __m128i*)&src[4 * 16 + offset]); // [47 46 45 44 43 42 41 40]
> + in05[i] = _mm_loadu_si128((const __m128i*)&src[5 * 16 + offset]); // [57 56 55 54 53 52 51 50]
> + in06[i] = _mm_loadu_si128((const __m128i*)&src[6 * 16 + offset]); // [67 66 65 64 63 62 61 60]
> + in07[i] = _mm_loadu_si128((const __m128i*)&src[7 * 16 + offset]); // [77 76 75 74 73 72 71 70]
> + in08[i] = _mm_loadu_si128((const __m128i*)&src[8 * 16 + offset]);
> + in09[i] = _mm_loadu_si128((const __m128i*)&src[9 * 16 + offset]);
> + in10[i] = _mm_loadu_si128((const __m128i*)&src[10 * 16 + offset]);
> + in11[i] = _mm_loadu_si128((const __m128i*)&src[11 * 16 + offset]);
> + in12[i] = _mm_loadu_si128((const __m128i*)&src[12 * 16 + offset]);
> + in13[i] = _mm_loadu_si128((const __m128i*)&src[13 * 16 + offset]);
> + in14[i] = _mm_loadu_si128((const __m128i*)&src[14 * 16 + offset]);
> + in15[i] = _mm_loadu_si128((const __m128i*)&src[15 * 16 + offset]);
> }
>
> for (int pass = 0; pass < 2; pass++)
> @@ -716,7 +667,7 @@
> _mm_store_si128((__m128i*)&dst[15 * stride + 8], in15[1]);
> }
>
> -void idct32(int32_t *src, int16_t *dst, intptr_t stride)
> +void idct32(int16_t *src, int16_t *dst, intptr_t stride)
> {
> //Odd
> const __m128i c16_p90_p90 = _mm_set1_epi32(0x005A005A); //column 0
> @@ -909,135 +860,38 @@
> for (int i = 0; i < 4; i++)
> {
> const int offset = (i << 3);
> - __m128i T00, T01;
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[0 * 32 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[0 * 32 + offset + 4]);
> - in00[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[1 * 32 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[1 * 32 + offset + 4]);
> - in01[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[2 * 32 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[2 * 32 + offset + 4]);
> - in02[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[3 * 32 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[3 * 32 + offset + 4]);
> - in03[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[4 * 32 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[4 * 32 + offset + 4]);
> - in04[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[5 * 32 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[5 * 32 + offset + 4]);
> - in05[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[6 * 32 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[6 * 32 + offset + 4]);
> - in06[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[7 * 32 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[7 * 32 + offset + 4]);
> - in07[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[8 * 32 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[8 * 32 + offset + 4]);
> - in08[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[9 * 32 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[9 * 32 + offset + 4]);
> - in09[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[10 * 32 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[10 * 32 + offset + 4]);
> - in10[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[11 * 32 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[11 * 32 + offset + 4]);
> - in11[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[12 * 32 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[12 * 32 + offset + 4]);
> - in12[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[13 * 32 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[13 * 32 + offset + 4]);
> - in13[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[14 * 32 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[14 * 32 + offset + 4]);
> - in14[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[15 * 32 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[15 * 32 + offset + 4]);
> - in15[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[16 * 32 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[16 * 32 + offset + 4]);
> - in16[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[17 * 32 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[17 * 32 + offset + 4]);
> - in17[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[18 * 32 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[18 * 32 + offset + 4]);
> - in18[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[19 * 32 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[19 * 32 + offset + 4]);
> - in19[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[20 * 32 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[20 * 32 + offset + 4]);
> - in20[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[21 * 32 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[21 * 32 + offset + 4]);
> - in21[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[22 * 32 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[22 * 32 + offset + 4]);
> - in22[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[23 * 32 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[23 * 32 + offset + 4]);
> - in23[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[24 * 32 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[24 * 32 + offset + 4]);
> - in24[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[25 * 32 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[25 * 32 + offset + 4]);
> - in25[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[26 * 32 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[26 * 32 + offset + 4]);
> - in26[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[27 * 32 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[27 * 32 + offset + 4]);
> - in27[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[28 * 32 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[28 * 32 + offset + 4]);
> - in28[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[29 * 32 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[29 * 32 + offset + 4]);
> - in29[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[30 * 32 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[30 * 32 + offset + 4]);
> - in30[i] = _mm_packs_epi32(T00, T01);
> -
> - T00 = _mm_loadu_si128((const __m128i*)&src[31 * 32 + offset]);
> - T01 = _mm_loadu_si128((const __m128i*)&src[31 * 32 + offset + 4]);
> - in31[i] = _mm_packs_epi32(T00, T01);
> + in00[i] = _mm_loadu_si128((const __m128i*)&src[0 * 32 + offset]);
> + in01[i] = _mm_loadu_si128((const __m128i*)&src[1 * 32 + offset]);
> + in02[i] = _mm_loadu_si128((const __m128i*)&src[2 * 32 + offset]);
> + in03[i] = _mm_loadu_si128((const __m128i*)&src[3 * 32 + offset]);
> + in04[i] = _mm_loadu_si128((const __m128i*)&src[4 * 32 + offset]);
> + in05[i] = _mm_loadu_si128((const __m128i*)&src[5 * 32 + offset]);
> + in06[i] = _mm_loadu_si128((const __m128i*)&src[6 * 32 + offset]);
> + in07[i] = _mm_loadu_si128((const __m128i*)&src[7 * 32 + offset]);
> + in08[i] = _mm_loadu_si128((const __m128i*)&src[8 * 32 + offset]);
> + in09[i] = _mm_loadu_si128((const __m128i*)&src[9 * 32 + offset]);
> + in10[i] = _mm_loadu_si128((const __m128i*)&src[10 * 32 + offset]);
> + in11[i] = _mm_loadu_si128((const __m128i*)&src[11 * 32 + offset]);
> + in12[i] = _mm_loadu_si128((const __m128i*)&src[12 * 32 + offset]);
> + in13[i] = _mm_loadu_si128((const __m128i*)&src[13 * 32 + offset]);
> + in14[i] = _mm_loadu_si128((const __m128i*)&src[14 * 32 + offset]);
> + in15[i] = _mm_loadu_si128((const __m128i*)&src[15 * 32 + offset]);
> + in16[i] = _mm_loadu_si128((const __m128i*)&src[16 * 32 + offset]);
> + in17[i] = _mm_loadu_si128((const __m128i*)&src[17 * 32 + offset]);
> + in18[i] = _mm_loadu_si128((const __m128i*)&src[18 * 32 + offset]);
> + in19[i] = _mm_loadu_si128((const __m128i*)&src[19 * 32 + offset]);
> + in20[i] = _mm_loadu_si128((const __m128i*)&src[20 * 32 + offset]);
> + in21[i] = _mm_loadu_si128((const __m128i*)&src[21 * 32 + offset]);
> + in22[i] = _mm_loadu_si128((const __m128i*)&src[22 * 32 + offset]);
> + in23[i] = _mm_loadu_si128((const __m128i*)&src[23 * 32 + offset]);
> + in24[i] = _mm_loadu_si128((const __m128i*)&src[24 * 32 + offset]);
> + in25[i] = _mm_loadu_si128((const __m128i*)&src[25 * 32 + offset]);
> + in26[i] = _mm_loadu_si128((const __m128i*)&src[26 * 32 + offset]);
> + in27[i] = _mm_loadu_si128((const __m128i*)&src[27 * 32 + offset]);
> + in28[i] = _mm_loadu_si128((const __m128i*)&src[28 * 32 + offset]);
> + in29[i] = _mm_loadu_si128((const __m128i*)&src[29 * 32 + offset]);
> + in30[i] = _mm_loadu_si128((const __m128i*)&src[30 * 32 + offset]);
> + in31[i] = _mm_loadu_si128((const __m128i*)&src[31 * 32 + offset]);
> }
>
> for (int pass = 0; pass < 2; pass++)
> @@ -1564,7 +1418,7 @@
> * still somewhat rare on end-user PCs we still compile and link these SSE3
> * intrinsic SIMD functions */
> #if !HIGH_BIT_DEPTH
> - p.idct[IDCT_8x8] = idct8;
> +// p.idct[IDCT_8x8] = idct8;
> p.idct[IDCT_16x16] = idct16;
> p.idct[IDCT_32x32] = idct32;
> #endif
> diff -r 2f0062f0791b -r 706fa4af912b source/common/vec/dct-sse41.cpp
> --- a/source/common/vec/dct-sse41.cpp Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/common/vec/dct-sse41.cpp Tue Nov 18 14:00:27 2014 +0530
> @@ -36,7 +36,7 @@
> using namespace x265;
>
> namespace {
> -void dequant_scaling(const int16_t* quantCoef, const int32_t *deQuantCoef, int32_t* coef, int num, int per, int shift)
> +void dequant_scaling(const int16_t* quantCoef, const int32_t *deQuantCoef, int16_t* coef, int num, int per, int shift)
> {
> X265_CHECK(num <= 32 * 32, "dequant num too large\n");
>
> @@ -66,11 +66,7 @@
> quantCoef2 = _mm_sra_epi32(_mm_add_epi32(_mm_mullo_epi32(quantCoef2, deQuantCoef2), IAdd), _mm_cvtsi32_si128(shift - per));
>
> quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2);
> - sign = _mm_srai_epi16(quantCoef12, 15);
> - quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign);
> - _mm_storeu_si128((__m128i*)(coef + n), quantCoef1);
> - quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign);
> - _mm_storeu_si128((__m128i*)(coef + n + 4), quantCoef2);
> + _mm_storeu_si128((__m128i*)(coef + n), quantCoef12);
> }
> }
> else
> @@ -100,11 +96,7 @@
> quantCoef2 = _mm_sll_epi32(quantCoef2, _mm_cvtsi32_si128(per - shift));
>
> quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2);
> - sign = _mm_srai_epi16(quantCoef12, 15);
> - quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign);
> - _mm_storeu_si128((__m128i*)(coef + n), quantCoef1);
> - quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign);
> - _mm_storeu_si128((__m128i*)(coef + n + 4), quantCoef2);
> + _mm_storeu_si128((__m128i*)(coef + n), quantCoef12);
> }
> }
> }
> diff -r 2f0062f0791b -r 706fa4af912b source/common/vec/dct-ssse3.cpp
> --- a/source/common/vec/dct-ssse3.cpp Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/common/vec/dct-ssse3.cpp Tue Nov 18 14:00:27 2014 +0530
> @@ -100,7 +100,7 @@
> #undef MAKE_COEF
> };
>
> -void dct16(int16_t *src, int32_t *dst, intptr_t stride)
> +void dct16(int16_t *src, int16_t *dst, intptr_t stride)
> {
> // Const
> __m128i c_4 = _mm_set1_epi32(4);
> @@ -344,8 +344,10 @@
> T41 = _mm_hsub_epi32(T30, T31);
> T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
> T41 = _mm_srai_epi32(_mm_add_epi32(T41, c_512), 10);
> - _mm_storeu_si128((__m128i*)&dst[0 * 16 + i], T40);
> - _mm_storeu_si128((__m128i*)&dst[8 * 16 + i], T41);
> + T40 = _mm_packs_epi32(T40, T40);
> + T41 = _mm_packs_epi32(T41, T41);
> + _mm_storel_epi64((__m128i*)&dst[0 * 16 + i], T40);
> + _mm_storel_epi64((__m128i*)&dst[8 * 16 + i], T41);
>
> T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
> T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
> @@ -366,7 +368,8 @@
>
> T40 = _mm_hadd_epi32(T30, T31);
> T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
> - _mm_storeu_si128((__m128i*)&dst[4 * 16 + i], T40);
> + T40 = _mm_packs_epi32(T40, T40);
> + _mm_storel_epi64((__m128i*)&dst[4 * 16 + i], T40);
>
> T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
> T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
> @@ -387,7 +390,8 @@
>
> T40 = _mm_hadd_epi32(T30, T31);
> T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
> - _mm_storeu_si128((__m128i*)&dst[12 * 16 + i], T40);
> + T40 = _mm_packs_epi32(T40, T40);
> + _mm_storel_epi64((__m128i*)&dst[12 * 16 + i], T40);
>
> T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
> T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
> @@ -408,7 +412,8 @@
>
> T40 = _mm_hadd_epi32(T30, T31);
> T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
> - _mm_storeu_si128((__m128i*)&dst[2 * 16 + i], T40);
> + T40 = _mm_packs_epi32(T40, T40);
> + _mm_storel_epi64((__m128i*)&dst[2 * 16 + i], T40);
>
> T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
> T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
> @@ -429,7 +434,8 @@
>
> T40 = _mm_hadd_epi32(T30, T31);
> T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
> - _mm_storeu_si128((__m128i*)&dst[6 * 16 + i], T40);
> + T40 = _mm_packs_epi32(T40, T40);
> + _mm_storel_epi64((__m128i*)&dst[6 * 16 + i], T40);
>
> T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
> T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
> @@ -450,7 +456,8 @@
>
> T40 = _mm_hadd_epi32(T30, T31);
> T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
> - _mm_storeu_si128((__m128i*)&dst[10 * 16 + i], T40);
> + T40 = _mm_packs_epi32(T40, T40);
> + _mm_storel_epi64((__m128i*)&dst[10 * 16 + i], T40);
>
> T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
> T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
> @@ -471,7 +478,8 @@
>
> T40 = _mm_hadd_epi32(T30, T31);
> T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
> - _mm_storeu_si128((__m128i*)&dst[14 * 16 + i], T40);
> + T40 = _mm_packs_epi32(T40, T40);
> + _mm_storel_epi64((__m128i*)&dst[14 * 16 + i], T40);
>
> #define MAKE_ODD(tab, dstPos) \
> T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); /* [*O2_0 *O1_0 *O3_0 *O0_0] */ \
> @@ -493,7 +501,8 @@
> \
> T40 = _mm_hadd_epi32(T30, T31); \
> T40 = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10); \
> - _mm_storeu_si128((__m128i*)&dst[(dstPos) * 16 + i], T40);
> + T40 = _mm_packs_epi32(T40, T40); \
> + _mm_storel_epi64((__m128i*)&dst[(dstPos) * 16 + i], T40);
>
> MAKE_ODD(14, 1);
> MAKE_ODD(16, 3);
> @@ -657,7 +666,7 @@
> #undef MAKE_COEF16
> };
>
> -void dct32(int16_t *src, int32_t *dst, intptr_t stride)
> +void dct32(int16_t *src, int16_t *dst, intptr_t stride)
> {
> // Const
> __m128i c_8 = _mm_set1_epi32(8);
> @@ -1050,7 +1059,8 @@
> T60 = _mm_hadd_epi32(T60, T61); \
> \
> T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_1024), 11); \
> - _mm_storeu_si128((__m128i*)&dst[(dstPos) * 32 + (i * 4) + 0], T60); \
> + T60 = _mm_packs_epi32(T60, T60); \
> + _mm_storel_epi64((__m128i*)&dst[(dstPos) * 32 + (i * 4) + 0], T60); \
>
> MAKE_ODD(44, 44, 44, 44, 0);
> MAKE_ODD(45, 45, 45, 45, 16);
> diff -r 2f0062f0791b -r 706fa4af912b source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/common/x86/asm-primitives.cpp Tue Nov 18 14:00:27 2014 +0530
> @@ -1336,7 +1336,6 @@
> p.sad_x4[LUMA_16x4] = x265_pixel_sad_x4_16x4_sse2;
> p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_mmx2;
>
> - p.cvt32to16_shr = x265_cvt32to16_shr_sse2;
> p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_sse2;
> p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_sse2;
> p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_sse2;
> @@ -1407,7 +1406,6 @@
> p.quant = x265_quant_sse4;
> p.nquant = x265_nquant_sse4;
> p.dequant_normal = x265_dequant_normal_sse4;
> - p.cvt16to32_shl = x265_cvt16to32_shl_sse4;
> p.cvt16to32_shr[BLOCK_4x4] = x265_cvt16to32_shr_4_sse4;
> p.cvt16to32_shr[BLOCK_8x8] = x265_cvt16to32_shr_8_sse4;
> p.cvt16to32_shr[BLOCK_16x16] = x265_cvt16to32_shr_16_sse4;
> @@ -1448,7 +1446,6 @@
> p.idct[IDCT_8x8] = x265_idct8_avx2;
> p.idct[IDCT_16x16] = x265_idct16_avx2;
> p.idct[IDCT_32x32] = x265_idct32_avx2;
> -
> p.transpose[BLOCK_8x8] = x265_transpose8_avx2;
> p.transpose[BLOCK_16x16] = x265_transpose16_avx2;
> p.transpose[BLOCK_32x32] = x265_transpose32_avx2;
> @@ -1551,7 +1548,6 @@
> p.frame_init_lowres_core = x265_frame_init_lowres_core_sse2;
> SA8D_INTER_FROM_BLOCK(sse2);
>
> - p.cvt32to16_shr = x265_cvt32to16_shr_sse2;
> p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_sse2;
> p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_sse2;
> p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_sse2;
> @@ -1565,9 +1561,11 @@
> p.transpose[BLOCK_64x64] = x265_transpose64_sse2;
> p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2;
> p.ssim_end_4 = x265_pixel_ssim_end4_sse2;
> +
> p.dct[DCT_4x4] = x265_dct4_sse2;
> p.idct[IDCT_4x4] = x265_idct4_sse2;
> p.idct[IDST_4x4] = x265_idst4_sse2;
> +
> p.planecopy_sp = x265_downShift_16_sse2;
> p.copy_shl[BLOCK_4x4] = x265_copy_shl_4_sse2;
> p.copy_shl[BLOCK_8x8] = x265_copy_shl_8_sse2;
> @@ -1616,7 +1614,7 @@
> LUMA_ADDAVG(_sse4);
> CHROMA_ADDAVG(_sse4);
> CHROMA_ADDAVG_422(_sse4);
> - p.cvt16to32_shl = x265_cvt16to32_shl_sse4;
> + p.cpy16to16_shl = x265_copy16to16_shl_sse4;
> p.cvt16to32_shr[BLOCK_4x4] = x265_cvt16to32_shr_4_sse4;
> p.cvt16to32_shr[BLOCK_8x8] = x265_cvt16to32_shr_8_sse4;
> p.cvt16to32_shr[BLOCK_16x16] = x265_cvt16to32_shr_16_sse4;
> @@ -1765,11 +1763,13 @@
> p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_avx2;
> p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_avx2;
> p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_avx2;
> +
> p.denoiseDct = x265_denoise_dct_avx2;
> p.dct[DCT_4x4] = x265_dct4_avx2;
> p.quant = x265_quant_avx2;
> p.nquant = x265_nquant_avx2;
> p.dequant_normal = x265_dequant_normal_avx2;
> +
> p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x4] = x265_blockcopy_ss_16x4_avx;
> p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x12] = x265_blockcopy_ss_16x12_avx;
> p.chroma[X265_CSP_I420].copy_ss[CHROMA_16x8] = x265_blockcopy_ss_16x8_avx;
> @@ -1785,6 +1785,7 @@
> p.weight_pp = x265_weight_pp_avx2;
>
> #if X86_64
> +
> p.dct[DCT_8x8] = x265_dct8_avx2;
> p.dct[DCT_16x16] = x265_dct16_avx2;
> p.dct[DCT_32x32] = x265_dct32_avx2;
> diff -r 2f0062f0791b -r 706fa4af912b source/common/x86/blockcopy8.asm
> --- a/source/common/x86/blockcopy8.asm Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/common/x86/blockcopy8.asm Tue Nov 18 14:00:27 2014 +0530
> @@ -3669,85 +3669,11 @@
> BLOCKCOPY_SS_W64_H4_avx 64, 48
> BLOCKCOPY_SS_W64_H4_avx 64, 64
>
> -;-----------------------------------------------------------------------------
> -; void cvt32to16_shr(short *dst, int *src, intptr_t stride, int shift, int size)
> -;-----------------------------------------------------------------------------
> -INIT_XMM sse2
> -cglobal cvt32to16_shr, 4, 7, 3, dst, src, stride
> -%define rnd m2
> -%define shift m1
> -
> - ; make shift
> - mov r5d, r3m
> - movd shift, r5d
> -
> - ; make round
> - dec r5
> - xor r6, r6
> - bts r6, r5
> -
> - movd rnd, r6d
> - pshufd rnd, rnd, 0
> -
> - ; register alloc
> - ; r0 - dst
> - ; r1 - src
> - ; r2 - stride * 2 (short*)
> - ; r3 - lx
> - ; r4 - size
> - ; r5 - ly
> - ; r6 - diff
> - add r2d, r2d
> -
> - mov r4d, r4m
> - mov r5, r4
> - mov r6, r2
> - sub r6, r4
> - add r6, r6
> -
> - shr r5, 1
> -.loop_row:
> -
> - mov r3, r4
> - shr r3, 2
> -.loop_col:
> - ; row 0
> - movu m0, [r1]
> - paddd m0, rnd
> - psrad m0, shift
> - packssdw m0, m0
> - movh [r0], m0
> -
> - ; row 1
> - movu m0, [r1 + r4 * 4]
> - paddd m0, rnd
> - psrad m0, shift
> - packssdw m0, m0
> - movh [r0 + r2], m0
> -
> - ; move col pointer
> - add r1, 16
> - add r0, 8
> -
> - dec r3
> - jg .loop_col
> -
> - ; update pointer
> - lea r1, [r1 + r4 * 4]
> - add r0, r6
> -
> - ; end of loop_row
> - dec r5
> - jg .loop_row
> -
> - RET
> -
> -
> ;--------------------------------------------------------------------------------------
> -; void cvt16to32_shl(int32_t *dst, int16_t *src, intptr_t stride, int shift, int size);
> +; void copy16to16_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size);
> ;--------------------------------------------------------------------------------------
> INIT_XMM sse4
> -cglobal cvt16to32_shl, 5, 7, 2, dst, src, stride, shift, size
> +cglobal copy16to16_shl, 5, 7, 2, dst, src, stride, shift, size
> %define shift m1
>
> ; make shift
> @@ -3764,16 +3690,16 @@
> sub r2d, r4d
> add r2d, r2d
> mov r5d, r4d
> - shr r4d, 2
> + shr r4d, 3
> .loop_row:
> mov r6d, r4d
>
> .loop_col:
> - pmovsxwd m0, [r1]
> - pslld m0, shift
> + movu m0, [r1]
> + psllw m0, shift
> movu [r0], m0
>
> - add r1, 8
> + add r1, 16
> add r0, 16
>
> dec r6d
> diff -r 2f0062f0791b -r 706fa4af912b source/common/x86/blockcopy8.h
> --- a/source/common/x86/blockcopy8.h Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/common/x86/blockcopy8.h Tue Nov 18 14:00:27 2014 +0530
> @@ -24,7 +24,6 @@
> #ifndef X265_BLOCKCOPY8_H
> #define X265_BLOCKCOPY8_H
>
> -void x265_cvt32to16_shr_sse2(int16_t * dst, int *src, intptr_t, int, int);
> void x265_cvt32to16_shl_4_sse2(int16_t * dst, int *src, intptr_t, int);
> void x265_cvt32to16_shl_8_sse2(int16_t * dst, int *src, intptr_t, int);
> void x265_cvt32to16_shl_16_sse2(int16_t * dst, int *src, intptr_t, int);
> @@ -33,7 +32,7 @@
> void x265_cvt32to16_shl_8_avx2(int16_t * dst, int *src, intptr_t, int);
> void x265_cvt32to16_shl_16_avx2(int16_t * dst, int *src, intptr_t, int);
> void x265_cvt32to16_shl_32_avx2(int16_t * dst, int *src, intptr_t, int);
> -void x265_cvt16to32_shl_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
> +void x265_copy16to16_shl_sse4(int16_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
> void x265_cvt16to32_shr_4_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
> void x265_cvt16to32_shr_8_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
> void x265_cvt16to32_shr_16_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
> diff -r 2f0062f0791b -r 706fa4af912b source/common/x86/dct8.asm
> --- a/source/common/x86/dct8.asm Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/common/x86/dct8.asm Tue Nov 18 14:00:27 2014 +0530
> @@ -245,7 +245,7 @@
>
> avx2_idct4_2: dw 64, 64, 64, -64, 83, 36, 36, -83
>
> -const idct4_shuf1, times 2 db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15
> +const idct4_shuf1, times 2 db 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15
>
> idct4_shuf2: times 2 db 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8 ,9 ,10, 11
>
> @@ -318,7 +318,7 @@
> cextern pw_ppppmmmm
>
> ;------------------------------------------------------
> -;void dct4(int16_t *src, int32_t *dst, intptr_t stride)
> +;void dct4(int16_t *src, int16_t *dst, intptr_t stride)
> ;------------------------------------------------------
> INIT_XMM sse2
> cglobal dct4, 3, 4, 8
> @@ -384,28 +384,28 @@
> paddd m1, m3
> paddd m1, m7
> psrad m1, 8
> +
> + pmaddwd m4, m2, m5
> + pmaddwd m3, m0, m5
> + psubd m4, m3
> + paddd m4, m7
> + psrad m4, 8
> + packssdw m1, m4
> movu [r1 + 0 * 16], m1
>
> - pmaddwd m1, m2, m5
> - pmaddwd m3, m0, m5
> - psubd m1, m3
> - paddd m1, m7
> - psrad m1, 8
> - movu [r1 + 1 * 16], m1
> -
> pmaddwd m1, m2, m6
> pmaddwd m3, m0, m6
> paddd m1, m3
> paddd m1, m7
> psrad m1, 8
> - movu [r1 + 2 * 16], m1
>
> pmaddwd m2, [r3 + 3 * 16]
> pmaddwd m0, [r3 + 3 * 16]
> psubd m2, m0
> paddd m2, m7
> psrad m2, 8
> - movu [r1 + 3 * 16], m2
> + packssdw m1, m2
> + movu [r1 + 1 * 16], m1
> RET
>
> ; DCT 4x4
> @@ -470,14 +470,12 @@
> paddd m2, m7
> psrad m2, 8
>
> - movu [r1], xm3
> - movu [r1 + mmsize/2], m2
> - vextracti128 [r1 + mmsize], m3, 1
> - vextracti128 [r1 + mmsize + mmsize/2], m2, 1
> + packssdw m3, m2
> + movu [r1], m3
> RET
>
> ;-------------------------------------------------------
> -;void idct4(int32_t *src, int16_t *dst, intptr_t stride)
> +;void idct4(int16_t *src, int16_t *dst, intptr_t stride)
> ;-------------------------------------------------------
> INIT_XMM sse2
> cglobal idct4, 3, 4, 7
> @@ -497,11 +495,6 @@
>
> movu m0, [r0 + 0 * 16]
> movu m1, [r0 + 1 * 16]
> - packssdw m0, m1
> -
> - movu m1, [r0 + 2 * 16]
> - movu m2, [r0 + 3 * 16]
> - packssdw m1, m2
>
> punpcklwd m2, m0, m1
> pmaddwd m3, m2, [r3 + 0 * 16] ; m3 = E1
> @@ -572,7 +565,7 @@
> RET
>
> ;------------------------------------------------------
> -;void dst4(int16_t *src, int32_t *dst, intptr_t stride)
> +;void dst4(int16_t *src, int16_t *dst, intptr_t stride)
> ;------------------------------------------------------
> INIT_XMM ssse3
> %if ARCH_X86_64
> @@ -638,33 +631,33 @@
> phaddd m0, m1
> paddd m0, m5
> psrad m0, 8
> +
> + pmaddwd m4, m2, coef1
> + pmaddwd m1, m3, coef1
> + phaddd m4, m1
> + paddd m4, m5
> + psrad m4, 8
> + packssdw m0, m4
> movu [r1 + 0 * 16], m0
>
> - pmaddwd m0, m2, coef1
> - pmaddwd m1, m3, coef1
> - phaddd m0, m1
> - paddd m0, m5
> - psrad m0, 8
> - movu [r1 + 1 * 16], m0
> -
> pmaddwd m0, m2, coef2
> pmaddwd m1, m3, coef2
> phaddd m0, m1
> paddd m0, m5
> psrad m0, 8
> - movu [r1 + 2 * 16], m0
>
> pmaddwd m2, coef3
> pmaddwd m3, coef3
> phaddd m2, m3
> paddd m2, m5
> psrad m2, 8
> - movu [r1 + 3 * 16], m2
> + packssdw m0, m2
> + movu [r1 + 1 * 16], m0
>
> RET
>
> ;-------------------------------------------------------
> -;void idst4(int32_t *src, int16_t *dst, intptr_t stride)
> +;void idst4(int16_t *src, int16_t *dst, intptr_t stride)
> ;-------------------------------------------------------
> INIT_XMM sse2
> cglobal idst4, 3, 4, 7
> @@ -683,11 +676,6 @@
>
> movu m0, [r0 + 0 * 16]
> movu m1, [r0 + 1 * 16]
> - packssdw m0, m1
> -
> - movu m1, [r0 + 2 * 16]
> - movu m2, [r0 + 3 * 16]
> - packssdw m1, m2
>
> punpcklwd m2, m0, m1 ; m2 = m128iAC
> punpckhwd m0, m1 ; m0 = m128iBD
> @@ -762,7 +750,7 @@
>
>
> ;-------------------------------------------------------
> -; void dct8(int16_t *src, int32_t *dst, intptr_t stride)
> +; void dct8(int16_t *src, int16_t *dst, intptr_t stride)
> ;-------------------------------------------------------
> INIT_XMM sse4
> cglobal dct8, 3,6,7,0-16*mmsize
> @@ -935,10 +923,16 @@
> phsubd m4, m2 ; m4 = [Row6 Row4]
> paddd m4, m6
> psrad m4, 9
> - movh [r1 + 0*2*mmsize], m3
> - movhps [r1 + 2*2*mmsize], m3
> - movh [r1 + 4*2*mmsize], m4
> - movhps [r1 + 6*2*mmsize], m4
> +
> + packssdw m3, m3
> + movd [r1 + 0*mmsize], m3
> + pshufd m3, m3, 1
> + movd [r1 + 2*mmsize], m3
> +
> + packssdw m4, m4
> + movd [r1 + 4*mmsize], m4
> + pshufd m4, m4, 1
> + movd [r1 + 6*mmsize], m4
>
> ; odd
> pmulld m2, m0, [r4 + 2*16]
> @@ -950,8 +944,11 @@
> phaddd m2, m4 ; m2 = [Row3 Row1]
> paddd m2, m6
> psrad m2, 9
> - movh [r1 + 1*2*mmsize], m2
> - movhps [r1 + 3*2*mmsize], m2
> +
> + packssdw m2, m2
> + movd [r1 + 1*mmsize], m2
> + pshufd m2, m2, 1
> + movd [r1 + 3*mmsize], m2
>
> pmulld m2, m0, [r4 + 4*16]
> pmulld m3, m1, [r4 + 4*16]
> @@ -962,10 +959,13 @@
> phaddd m2, m4 ; m2 = [Row7 Row5]
> paddd m2, m6
> psrad m2, 9
> - movh [r1 + 5*2*mmsize], m2
> - movhps [r1 + 7*2*mmsize], m2
> -
> - add r1, mmsize/2
> +
> + packssdw m2, m2
> + movd [r1 + 5*mmsize], m2
> + pshufd m2, m2, 1
> + movd [r1 + 7*mmsize], m2
> +
> + add r1, mmsize/4
> add r0, 2*2*mmsize
> %endrep
>
> @@ -974,17 +974,16 @@
> RET
>
> ;-------------------------------------------------------
> -; void idct8(int32_t *src, int16_t *dst, intptr_t stride)
> +; void idct8(int16_t *src, int16_t *dst, intptr_t stride)
> ;-------------------------------------------------------
> INIT_XMM ssse3
>
> cglobal patial_butterfly_inverse_internal_pass1
> - movu m0, [r0]
> - movu m1, [r0 + 4 * 32]
> - movu m2, [r0 + 2 * 32]
> - movu m3, [r0 + 6 * 32]
> - packssdw m0, m2
> - packssdw m1, m3
> + movh m0, [r0]
> + movhps m0, [r0 + 2 * 16]
> + movh m1, [r0 + 4 * 16]
> + movhps m1, [r0 + 6 * 16]
> +
> punpckhwd m2, m0, m1 ; [2 6]
> punpcklwd m0, m1 ; [0 4]
> pmaddwd m1, m0, [r6] ; EE[0]
> @@ -1004,12 +1003,10 @@
> paddd m3, m5
> paddd m4, m5
>
> - movu m2, [r0 + 32]
> - movu m5, [r0 + 5 * 32]
> - packssdw m2, m5
> - movu m5, [r0 + 3 * 32]
> - movu m6, [r0 + 7 * 32]
> - packssdw m5, m6
> + movh m2, [r0 + 16]
> + movhps m2, [r0 + 5 * 16]
> + movh m5, [r0 + 3 * 16]
> + movhps m5, [r0 + 7 * 16]
> punpcklwd m6, m2, m5 ;[1 3]
> punpckhwd m2, m5 ;[5 7]
>
> @@ -1136,7 +1133,7 @@
>
> call patial_butterfly_inverse_internal_pass1
>
> - add r0, 16
> + add r0, 8
> add r5, 8
>
> call patial_butterfly_inverse_internal_pass1
> @@ -1167,53 +1164,68 @@
>
>
> ;-----------------------------------------------------------------------------
> -; void denoise_dct(int32_t *dct, uint32_t *sum, uint16_t *offset, int size)
> +; void denoise_dct(int16_t *dct, uint32_t *sum, uint16_t *offset, int size)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse4
> cglobal denoise_dct, 4, 4, 6
> pxor m5, m5
> - shr r3d, 2
> + shr r3d, 3
> .loop:
> mova m0, [r0]
> - pabsd m1, m0
> + pabsw m1, m0
> +
> mova m2, [r1]
> - paddd m2, m1
> + pmovsxwd m3, m1
> + paddd m2, m3
> mova [r1], m2
> - pmovzxwd m3, [r2]
> - psubd m1, m3
> - pcmpgtd m4, m1, m5
> + mova m2, [r1 + 16]
> + psrldq m3, m1, 8
> + pmovsxwd m4, m3
> + paddd m2, m4
> + mova [r1 + 16], m2
> +
> + movu m3, [r2]
> + psubsw m1, m3
> + pcmpgtw m4, m1, m5
> pand m1, m4
> - psignd m1, m0
> + psignw m1, m0
> mova [r0], m1
> add r0, 16
> - add r1, 16
> - add r2, 8
> - dec r3d
> - jnz .loop
> - RET
> -
> -INIT_YMM avx2
> -cglobal denoise_dct, 4, 4, 6
> - pxor m5, m5
> - shr r3d, 3
> -.loop:
> - movu m0, [r0]
> - pabsd m1, m0
> - movu m2, [r1]
> - paddd m2, m1
> - movu [r1], m2
> - pmovzxwd m3, [r2]
> - psubd m1, m3
> - pcmpgtd m4, m1, m5
> - pand m1, m4
> - psignd m1, m0
> - movu [r0], m1
> - add r0, 32
> add r1, 32
> add r2, 16
> dec r3d
> jnz .loop
> RET
> +
> +INIT_YMM avx2
> +cglobal denoise_dct, 4, 4, 6
> + pxor m5, m5
> + shr r3d, 4
> +.loop:
> + movu m0, [r0]
> + pabsw m1, m0
> + movu m2, [r1]
> + pmovsxwd m4, xm1
> + paddd m2, m4
> + movu [r1], m2
> + vextracti128 xm4, m1, 1
> + movu m2, [r1 + 32]
> + pmovsxwd m3, xm4
> + paddd m2, m3
> + movu [r1 + 32], m2
> + movu m3, [r2]
> + psubw m1, m3
> + pcmpgtw m4, m1, m5
> + pand m1, m4
> + psignw m1, m0
> + movu [r0], m1
> + add r0, 32
> + add r1, 64
> + add r2, 32
> + dec r3d
> + jnz .loop
> + RET
> +
> %if ARCH_X86_64 == 1
> %macro DCT8_PASS_1 4
> vpbroadcastq m0, [r6 + %1]
> @@ -1227,7 +1239,7 @@
> mova [r5 + %2], xm2
> %endmacro
>
> -%macro DCT8_PASS_2 1
> +%macro DCT8_PASS_2 2
> vbroadcasti128 m4, [r6 + %1]
> pmaddwd m6, m0, m4
> pmaddwd m7, m1, m4
> @@ -1238,10 +1250,25 @@
> phaddd m6, m8
> paddd m6, m5
> psrad m6, DCT_SHIFT2
> +
> + vbroadcasti128 m4, [r6 + %2]
> + pmaddwd m10, m0, m4
> + pmaddwd m7, m1, m4
> + pmaddwd m8, m2, m4
> + pmaddwd m9, m3, m4
> + phaddd m10, m7
> + phaddd m8, m9
> + phaddd m10, m8
> + paddd m10, m5
> + psrad m10, DCT_SHIFT2
> +
> + packssdw m6, m10
> + vpermq m10, m6, 0xD8
> +
> %endmacro
>
> INIT_YMM avx2
> -cglobal dct8, 3, 7, 10, 0-8*16
> +cglobal dct8, 3, 7, 11, 0-8*16
> %if BIT_DEPTH == 10
> %define DCT_SHIFT 4
> vbroadcasti128 m5, [pd_8]
> @@ -1294,9 +1321,6 @@
> DCT8_PASS_1 7 * 16, 7 * 16, 4, 1
>
> ;pass2
> - mov r2d, 32
> - lea r3, [r2 * 3]
> - lea r4, [r1 + r2 * 4]
> vbroadcasti128 m5, [pd_256]
>
> mova m0, [r5]
> @@ -1304,22 +1328,14 @@
> mova m2, [r5 + 64]
> mova m3, [r5 + 96]
>
> - DCT8_PASS_2 0 * 16
> - movu [r1], m6
> - DCT8_PASS_2 1 * 16
> - movu [r1 + r2], m6
> - DCT8_PASS_2 2 * 16
> - movu [r1 + r2 * 2], m6
> - DCT8_PASS_2 3 * 16
> - movu [r1 + r3], m6
> - DCT8_PASS_2 4 * 16
> - movu [r4], m6
> - DCT8_PASS_2 5 * 16
> - movu [r4 + r2], m6
> - DCT8_PASS_2 6 * 16
> - movu [r4 + r2 * 2], m6
> - DCT8_PASS_2 7 * 16
> - movu [r4 + r3], m6
> + DCT8_PASS_2 0 * 16, 1 * 16
> + movu [r1], m10
> + DCT8_PASS_2 2 * 16, 3 * 16
> + movu [r1 + 32], m10
> + DCT8_PASS_2 4 * 16, 5 * 16
> + movu [r1 + 64], m10
> + DCT8_PASS_2 6 * 16, 7 * 16
> + movu [r1 + 96], m10
> RET
>
> %macro DCT16_PASS_1_E 2
> @@ -1360,7 +1376,7 @@
> mova [r5 + %2], xm10
> %endmacro
>
> -%macro DCT16_PASS_2 1
> +%macro DCT16_PASS_2 2
> vbroadcasti128 m8, [r7 + %1]
> vbroadcasti128 m13, [r8 + %1]
>
> @@ -1385,9 +1401,40 @@
> phaddd m10, m11
> paddd m10, m9
> psrad m10, DCT_SHIFT2
> +
> +
> + vbroadcasti128 m8, [r7 + %2]
> + vbroadcasti128 m13, [r8 + %2]
> +
> + pmaddwd m14, m0, m8
> + pmaddwd m11, m1, m13
> + paddd m14, m11
> +
> + pmaddwd m11, m2, m8
> + pmaddwd m12, m3, m13
> + paddd m11, m12
> + phaddd m14, m11
> +
> + pmaddwd m11, m4, m8
> + pmaddwd m12, m5, m13
> + paddd m11, m12
> +
> + pmaddwd m12, m6, m8
> + pmaddwd m13, m7, m13
> + paddd m12, m13
> + phaddd m11, m12
> +
> + phaddd m14, m11
> + paddd m14, m9
> + psrad m14, DCT_SHIFT2
> +
> + packssdw m10, m14
> + vextracti128 xm14, m10, 1
> + movlhps xm15, xm10, xm14
> + movhlps xm14, xm10
> %endmacro
> INIT_YMM avx2
> -cglobal dct16, 3, 9, 15, 0-16*mmsize
> +cglobal dct16, 3, 9, 16, 0-16*mmsize
> %if BIT_DEPTH == 10
> %define DCT_SHIFT 5
> vbroadcasti128 m9, [pd_16]
> @@ -1487,7 +1534,7 @@
>
> mov r5, rsp
> mov r4d, 2
> - mov r2d, 64
> + mov r2d, 32
> lea r3, [r2 * 3]
> vbroadcasti128 m9, [pd_512]
>
> @@ -1504,46 +1551,42 @@
> mova m6, [r5 + 3 * 32] ; [row3lo row7lo]
> mova m7, [r5 + 11 * 32] ; [row3hi row7hi]
>
> - DCT16_PASS_2 -8 * 16
> - movu [r1], m10
> - DCT16_PASS_2 -7 * 16
> - movu [r1 + r2], m10
> - DCT16_PASS_2 -6 * 16
> - movu [r1 + r2 * 2], m10
> - DCT16_PASS_2 -5 * 16
> - movu [r1 + r3], m10
> + DCT16_PASS_2 -8 * 16, -7 * 16
> + movu [r1], xm15
> + movu [r1 + r2], xm14
> +
> + DCT16_PASS_2 -6 * 16, -5 * 16
> + movu [r1 + r2 * 2], xm15
> + movu [r1 + r3], xm14
>
> lea r6, [r1 + r2 * 4]
> - DCT16_PASS_2 -4 * 16
> - movu [r6], m10
> - DCT16_PASS_2 -3 * 16
> - movu [r6 + r2], m10
> - DCT16_PASS_2 -2 * 16
> - movu [r6 + r2 * 2], m10
> - DCT16_PASS_2 -1 * 16
> - movu [r6 + r3], m10
> + DCT16_PASS_2 -4 * 16, -3 * 16
> + movu [r6], xm15
> + movu [r6 + r2], xm14
> +
> + DCT16_PASS_2 -2 * 16, -1 * 16
> + movu [r6 + r2 * 2], xm15
> + movu [r6 + r3], xm14
>
> lea r6, [r6 + r2 * 4]
> - DCT16_PASS_2 0 * 16
> - movu [r6], m10
> - DCT16_PASS_2 1 * 16
> - movu [r6 + r2], m10
> - DCT16_PASS_2 2 * 16
> - movu [r6 + r2 * 2], m10
> - DCT16_PASS_2 3 * 16
> - movu [r6 + r3], m10
> + DCT16_PASS_2 0 * 16, 1 * 16
> + movu [r6], xm15
> + movu [r6 + r2], xm14
> +
> + DCT16_PASS_2 2 * 16, 3 * 16
> + movu [r6 + r2 * 2], xm15
> + movu [r6 + r3], xm14
>
> lea r6, [r6 + r2 * 4]
> - DCT16_PASS_2 4 * 16
> - movu [r6], m10
> - DCT16_PASS_2 5 * 16
> - movu [r6 + r2], m10
> - DCT16_PASS_2 6 * 16
> - movu [r6 + r2 * 2], m10
> - DCT16_PASS_2 7 * 16
> - movu [r6 + r3], m10
> -
> - add r1, 32
> + DCT16_PASS_2 4 * 16, 5 * 16
> + movu [r6], xm15
> + movu [r6 + r2], xm14
> +
> + DCT16_PASS_2 6 * 16, 7 * 16
> + movu [r6 + r2 * 2], xm15
> + movu [r6 + r3], xm14
> +
> + add r1, 16
> add r5, 128
>
> dec r4d
> @@ -1609,6 +1652,7 @@
>
> paddd xm11, xm9
> psrad xm11, DCT_SHIFT2
> + packssdw xm11, xm11
>
> %endmacro
>
> @@ -1704,7 +1748,7 @@
> dec r4d
> jnz .pass1
>
> - mov r2d, 128
> + mov r2d, 64
> lea r3, [r2 * 3]
> mov r5, rsp
> mov r4d, 8
> @@ -1724,86 +1768,86 @@
> mova m7, [r5 + 3 * 64 + 32]
>
> DCT32_PASS_2 0 * 32
> - movu [r1], xm11
> + movq [r1], xm11
> DCT32_PASS_2 1 * 32
> - movu [r1 + r2], xm11
> + movq [r1 + r2], xm11
> DCT32_PASS_2 2 * 32
> - movu [r1 + r2 * 2], xm11
> + movq [r1 + r2 * 2], xm11
> DCT32_PASS_2 3 * 32
> - movu [r1 + r3], xm11
> + movq [r1 + r3], xm11
>
> lea r6, [r1 + r2 * 4]
> DCT32_PASS_2 4 * 32
> - movu [r6], xm11
> + movq [r6], xm11
> DCT32_PASS_2 5 * 32
> - movu [r6 + r2], xm11
> + movq [r6 + r2], xm11
> DCT32_PASS_2 6 * 32
> - movu [r6 + r2 * 2], xm11
> + movq [r6 + r2 * 2], xm11
> DCT32_PASS_2 7 * 32
> - movu [r6 + r3], xm11
> + movq [r6 + r3], xm11
>
> lea r6, [r6 + r2 * 4]
> DCT32_PASS_2 8 * 32
> - movu [r6], xm11
> + movq [r6], xm11
> DCT32_PASS_2 9 * 32
> - movu [r6 + r2], xm11
> + movq [r6 + r2], xm11
> DCT32_PASS_2 10 * 32
> - movu [r6 + r2 * 2], xm11
> + movq [r6 + r2 * 2], xm11
> DCT32_PASS_2 11 * 32
> - movu [r6 + r3], xm11
> + movq [r6 + r3], xm11
>
> lea r6, [r6 + r2 * 4]
> DCT32_PASS_2 12 * 32
> - movu [r6], xm11
> + movq [r6], xm11
> DCT32_PASS_2 13 * 32
> - movu [r6 + r2], xm11
> + movq [r6 + r2], xm11
> DCT32_PASS_2 14 * 32
> - movu [r6 + r2 * 2], xm11
> + movq [r6 + r2 * 2], xm11
> DCT32_PASS_2 15 * 32
> - movu [r6 + r3], xm11
> + movq [r6 + r3], xm11
>
> lea r6, [r6 + r2 * 4]
> DCT32_PASS_2 16 * 32
> - movu [r6], xm11
> + movq [r6], xm11
> DCT32_PASS_2 17 * 32
> - movu [r6 + r2], xm11
> + movq [r6 + r2], xm11
> DCT32_PASS_2 18 * 32
> - movu [r6 + r2 * 2], xm11
> + movq [r6 + r2 * 2], xm11
> DCT32_PASS_2 19 * 32
> - movu [r6 + r3], xm11
> + movq [r6 + r3], xm11
>
> lea r6, [r6 + r2 * 4]
> DCT32_PASS_2 20 * 32
> - movu [r6], xm11
> + movq [r6], xm11
> DCT32_PASS_2 21 * 32
> - movu [r6 + r2], xm11
> + movq [r6 + r2], xm11
> DCT32_PASS_2 22 * 32
> - movu [r6 + r2 * 2], xm11
> + movq [r6 + r2 * 2], xm11
> DCT32_PASS_2 23 * 32
> - movu [r6 + r3], xm11
> + movq [r6 + r3], xm11
>
> lea r6, [r6 + r2 * 4]
> DCT32_PASS_2 24 * 32
> - movu [r6], xm11
> + movq [r6], xm11
> DCT32_PASS_2 25 * 32
> - movu [r6 + r2], xm11
> + movq [r6 + r2], xm11
> DCT32_PASS_2 26 * 32
> - movu [r6 + r2 * 2], xm11
> + movq [r6 + r2 * 2], xm11
> DCT32_PASS_2 27 * 32
> - movu [r6 + r3], xm11
> + movq [r6 + r3], xm11
>
> lea r6, [r6 + r2 * 4]
> DCT32_PASS_2 28 * 32
> - movu [r6], xm11
> + movq [r6], xm11
> DCT32_PASS_2 29 * 32
> - movu [r6 + r2], xm11
> + movq [r6 + r2], xm11
> DCT32_PASS_2 30 * 32
> - movu [r6 + r2 * 2], xm11
> + movq [r6 + r2 * 2], xm11
> DCT32_PASS_2 31 * 32
> - movu [r6 + r3], xm11
> + movq [r6 + r3], xm11
>
> add r5, 256
> - add r1, 16
> + add r1, 8
>
> dec r4d
> jnz .pass2
> @@ -1926,28 +1970,25 @@
> lea r6, [avx2_idct8_2]
>
> ;pass1
> - mova m0, [r0 + 0 * 32]
> - mova m1, [r0 + 4 * 32]
> - packssdw m0, m1 ; [0 0 0 0 4 4 4 4 0 0 0 0 4 4 4 4]
> - mova m1, [r0 + 2 * 32]
> - mova m2, [r0 + 6 * 32]
> - packssdw m1, m2 ; [2 2 2 2 6 6 6 6 2 2 2 2 6 6 6 6]
> - mova m2, [r0 + 1 * 32]
> - mova m3, [r0 + 5 * 32]
> - packssdw m2, m3 ; [1 1 1 1 5 5 5 5 1 1 1 1 5 5 5 5]
> - mova m3, [r0 + 3 * 32]
> - mova m4, [r0 + 7 * 32]
> - packssdw m3, m4 ; [3 3 3 3 7 7 7 7 3 3 3 3 7 7 7 7]
> + mova m1, [r0 + 0 * 32] ; [0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1]
> + mova m0, [r0 + 1 * 32] ; [2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3]
> + vpunpcklwd m5, m1, m0 ; [0 2 0 2 0 2 0 2 1 3 1 3 1 3 1 3]
> + vpunpckhwd m1, m0 ; [0 2 0 2 0 2 0 2 1 3 1 3 1 3 1 3]
> + vinserti128 m4, m5, xm1, 1 ; [0 2 0 2 0 2 0 2 0 2 0 2 0 2 0 2]
> + vextracti128 xm2, m5, 1 ; [1 3 1 3 1 3 1 3]
> + vinserti128 m1, m1, xm2, 0 ; [1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3]
> +
> + mova m2, [r0 + 2 * 32] ; [4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5]
> + mova m0, [r0 + 3 * 32] ; [6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7]
> + vpunpcklwd m5, m2, m0 ; [4 6 4 6 4 6 4 6 5 7 5 7 5 7 5 7]
> + vpunpckhwd m2, m0 ; [4 6 4 6 4 6 4 6 5 7 5 7 5 7 5 7]
> + vinserti128 m0, m5, xm2, 1 ; [4 6 4 6 4 6 4 6 4 6 4 6 4 6 4 6]
> + vextracti128 xm5, m5, 1 ; [5 7 5 7 5 7 5 7]
> + vinserti128 m2, m2, xm5, 0 ; [5 7 5 7 5 7 5 7 5 7 5 7 5 7 5 7]
>
> mova m5, [idct8_shuf1]
> -
> - punpcklwd m4, m0, m1 ; [0 2 0 2 0 2 0 2 0 2 0 2 0 2 0 2]
> - punpckhwd m0, m1 ; [4 6 4 6 4 6 4 6 4 6 4 6 4 6 4 6]
> vpermd m4, m5, m4
> vpermd m0, m5, m0
> -
> - punpcklwd m1, m2, m3 ; [1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3]
> - punpckhwd m2, m3 ; [5 7 5 7 5 7 5 7 5 7 5 7 5 7 5 7]
> vpermd m1, m5, m1
> vpermd m2, m5, m2
>
> @@ -2065,7 +2106,7 @@
> %endmacro
>
> ;-------------------------------------------------------
> -; void idct16(int32_t *src, int16_t *dst, intptr_t stride)
> +; void idct16(int16_t *src, int16_t *dst, intptr_t stride)
> ;-------------------------------------------------------
> INIT_YMM avx2
> cglobal idct16, 3, 7, 16, 0-16*mmsize
> @@ -2087,37 +2128,53 @@
> mov r4d, 2
>
> .pass1:
> - movu m0, [r0 + 0 * 64]
> - movu m1, [r0 + 8 * 64]
> - packssdw m0, m1 ;[0L 8L 0H 8H]
> -
> - movu m1, [r0 + 1 * 64]
> - movu m2, [r0 + 9 * 64]
> - packssdw m1, m2 ;[1L 9L 1H 9H]
> -
> - movu m2, [r0 + 2 * 64]
> - movu m3, [r0 + 10 * 64]
> - packssdw m2, m3 ;[2L 10L 2H 10H]
> -
> - movu m3, [r0 + 3 * 64]
> - movu m4, [r0 + 11 * 64]
> - packssdw m3, m4 ;[3L 11L 3H 11H]
> -
> - movu m4, [r0 + 4 * 64]
> - movu m5, [r0 + 12 * 64]
> - packssdw m4, m5 ;[4L 12L 4H 12H]
> -
> - movu m5, [r0 + 5 * 64]
> - movu m6, [r0 + 13 * 64]
> - packssdw m5, m6 ;[5L 13L 5H 13H]
> -
> - movu m6, [r0 + 6 * 64]
> - movu m7, [r0 + 14 * 64]
> - packssdw m6, m7 ;[6L 14L 6H 14H]
> -
> - movu m7, [r0 + 7 * 64]
> - movu m8, [r0 + 15 * 64]
> - packssdw m7, m8 ;[7L 15L 7H 15H]
> + movu xm0, [r0 + 0 * 32]
> + movu xm1, [r0 + 8 * 32]
> + punpckhqdq xm2, xm0, xm1
> + punpcklqdq xm0, xm1
> + vinserti128 m0, m0, xm2, 1
> +
> + movu xm1, [r0 + 1 * 32]
> + movu xm2, [r0 + 9 * 32]
> + punpckhqdq xm3, xm1, xm2
> + punpcklqdq xm1, xm2
> + vinserti128 m1, m1, xm3, 1
> +
> + movu xm2, [r0 + 2 * 32]
> + movu xm3, [r0 + 10 * 32]
> + punpckhqdq xm4, xm2, xm3
> + punpcklqdq xm2, xm3
> + vinserti128 m2, m2, xm4, 1
> +
> + movu xm3, [r0 + 3 * 32]
> + movu xm4, [r0 + 11 * 32]
> + punpckhqdq xm5, xm3, xm4
> + punpcklqdq xm3, xm4
> + vinserti128 m3, m3, xm5, 1
> +
> + movu xm4, [r0 + 4 * 32]
> + movu xm5, [r0 + 12 * 32]
> + punpckhqdq xm6, xm4, xm5
> + punpcklqdq xm4, xm5
> + vinserti128 m4, m4, xm6, 1
> +
> + movu xm5, [r0 + 5 * 32]
> + movu xm6, [r0 + 13 * 32]
> + punpckhqdq xm7, xm5, xm6
> + punpcklqdq xm5, xm6
> + vinserti128 m5, m5, xm7, 1
> +
> + movu xm6, [r0 + 6 * 32]
> + movu xm7, [r0 + 14 * 32]
> + punpckhqdq xm8, xm6, xm7
> + punpcklqdq xm6, xm7
> + vinserti128 m6, m6, xm8, 1
> +
> + movu xm7, [r0 + 7 * 32]
> + movu xm8, [r0 + 15 * 32]
> + punpckhqdq xm9, xm7, xm8
> + punpcklqdq xm7, xm8
> + vinserti128 m7, m7, xm9, 1
>
> punpckhwd m8, m0, m2 ;[8 10]
> punpcklwd m0, m2 ;[0 2]
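The reworked idct16 load above replaces the packssdw of two int32_t source rows with a
quadword interleave of two int16_t rows. An intrinsics sketch of one such load pair,
purely illustrative and not the generated code:

    #include <immintrin.h>
    #include <stdint.h>

    /* sketch: pair the first 8 coefficients of row N and row N+8 so the low ymm
       half holds the matching low quadwords and the high half the high quadwords,
       as punpcklqdq/punpckhqdq + vinserti128 do above */
    static __m256i load_row_pair(const int16_t *rowN, const int16_t *rowN8)
    {
        __m128i a  = _mm_loadu_si128((const __m128i *)rowN);
        __m128i b  = _mm_loadu_si128((const __m128i *)rowN8);
        __m128i lo = _mm_unpacklo_epi64(a, b);   /* punpcklqdq */
        __m128i hi = _mm_unpackhi_epi64(a, b);   /* punpckhqdq */
        return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
    }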
> @@ -2160,7 +2217,7 @@
> IDCT_PASS1 4, 10
> IDCT_PASS1 6, 8
>
> - add r0, 32
> + add r0, 16
> add r3, 16
> dec r4d
> jnz .pass1
> @@ -2328,7 +2385,7 @@
> %endmacro
>
> ;-------------------------------------------------------
> -; void idct32(int32_t *src, int16_t *dst, intptr_t stride)
> +; void idct32(int16_t *src, int16_t *dst, intptr_t stride)
> ;-------------------------------------------------------
>
> ; TODO: Reduce PHADDD instruction by PADDD
> @@ -2345,54 +2402,69 @@
> mov r5d, 8
>
> .pass1:
> - movu xm0, [r0 + 2 * 128]
> - movu xm1, [r0 + 18 * 128]
> - vinserti128 m0, m0, [r0 + 0 * 128], 1
> - vinserti128 m1, m1, [r0 + 16 * 128], 1
> -
> - packssdw m0, m1 ;[2 18 0 16]
> -
> - movu xm1, [r0 + 1 * 128]
> - movu xm2, [r0 + 9 * 128]
> - vinserti128 m1, m1, [r0 + 17 * 128], 1
> - vinserti128 m2, m2, [r0 + 25 * 128], 1
> - packssdw m1, m2 ;[1 9 17 25]
> -
> - movu xm2, [r0 + 6 * 128]
> - movu xm3, [r0 + 22 * 128]
> - vinserti128 m2, m2, [r0 + 4 * 128], 1
> - vinserti128 m3, m3, [r0 + 20 * 128], 1
> - packssdw m2, m3 ;[6 22 4 20]
> -
> - movu xm3, [r0 + 3 * 128]
> - movu xm4, [r0 + 11 * 128]
> - vinserti128 m3, m3, [r0 + 19 * 128], 1
> - vinserti128 m4, m4, [r0 + 27 * 128], 1
> - packssdw m3, m4 ;[3 11 19 27]
> -
> - movu xm4, [r0 + 10 * 128]
> - movu xm5, [r0 + 26 * 128]
> - vinserti128 m4, m4, [r0 + 8 * 128], 1
> - vinserti128 m5, m5, [r0 + 24 * 128], 1
> - packssdw m4, m5 ;[10 26 8 24]
> -
> - movu xm5, [r0 + 5 * 128]
> - movu xm6, [r0 + 13 * 128]
> - vinserti128 m5, m5, [r0 + 21 * 128], 1
> - vinserti128 m6, m6, [r0 + 29 * 128], 1
> - packssdw m5, m6 ;[5 13 21 29]
> -
> - movu xm6, [r0 + 14 * 128]
> - movu xm7, [r0 + 30 * 128]
> - vinserti128 m6, m6, [r0 + 12 * 128], 1
> - vinserti128 m7, m7, [r0 + 28 * 128], 1
> - packssdw m6, m7 ;[14 30 12 28]
> -
> - movu xm7, [r0 + 7 * 128]
> - movu xm8, [r0 + 15 * 128]
> - vinserti128 m7, m7, [r0 + 23 * 128], 1
> - vinserti128 m8, m8, [r0 + 31 * 128], 1
> - packssdw m7, m8 ;[7 15 23 31]
> + movq xm0, [r0 + 2 * 64]
> + movq xm1, [r0 + 18 * 64]
> + punpcklqdq xm0, xm0, xm1
> + movq xm1, [r0 + 0 * 64]
> + movq xm2, [r0 + 16 * 64]
> + punpcklqdq xm1, xm1, xm2
> + vinserti128 m0, m0, xm1, 1 ;[2 18 0 16]
> +
> + movq xm1, [r0 + 1 * 64]
> + movq xm2, [r0 + 9 * 64]
> + punpcklqdq xm1, xm1, xm2
> + movq xm2, [r0 + 17 * 64]
> + movq xm3, [r0 + 25 * 64]
> + punpcklqdq xm2, xm2, xm3
> + vinserti128 m1, m1, xm2, 1 ;[1 9 17 25]
> +
> + movq xm2, [r0 + 6 * 64]
> + movq xm3, [r0 + 22 * 64]
> + punpcklqdq xm2, xm2, xm3
> + movq xm3, [r0 + 4 * 64]
> + movq xm4, [r0 + 20 * 64]
> + punpcklqdq xm3, xm3, xm4
> + vinserti128 m2, m2, xm3, 1 ;[6 22 4 20]
> +
> + movq xm3, [r0 + 3 * 64]
> + movq xm4, [r0 + 11 * 64]
> + punpcklqdq xm3, xm3, xm4
> + movq xm4, [r0 + 19 * 64]
> + movq xm5, [r0 + 27 * 64]
> + punpcklqdq xm4, xm4, xm5
> + vinserti128 m3, m3, xm4, 1 ;[3 11 19 27]

> +
> + movq xm4, [r0 + 10 * 64]
> + movq xm5, [r0 + 26 * 64]
> + punpcklqdq xm4, xm4, xm5
> + movq xm5, [r0 + 8 * 64]
> + movq xm6, [r0 + 24 * 64]
> + punpcklqdq xm5, xm5, xm6
> + vinserti128 m4, m4, xm5, 1 ;[10 26 8 24]
> +
> + movq xm5, [r0 + 5 * 64]
> + movq xm6, [r0 + 13 * 64]
> + punpcklqdq xm5, xm5, xm6
> + movq xm6, [r0 + 21 * 64]
> + movq xm7, [r0 + 29 * 64]
> + punpcklqdq xm6, xm6, xm7
> + vinserti128 m5, m5, xm6, 1 ;[5 13 21 29]
> +
> + movq xm6, [r0 + 14 * 64]
> + movq xm7, [r0 + 30 * 64]
> + punpcklqdq xm6, xm6, xm7
> + movq xm7, [r0 + 12 * 64]
> + movq xm8, [r0 + 28 * 64]
> + punpcklqdq xm7, xm7, xm8
> + vinserti128 m6, m6, xm7, 1 ;[14 30 12 28]
> +
> + movq xm7, [r0 + 7 * 64]
> + movq xm8, [r0 + 15 * 64]
> + punpcklqdq xm7, xm7, xm8
> + movq xm8, [r0 + 23 * 64]
> + movq xm9, [r0 + 31 * 64]
> + punpcklqdq xm8, xm8, xm9
> + vinserti128 m7, m7, xm8, 1 ;[7 15 23 31]
>
> punpckhwd m8, m0, m2 ;[18 22 16 20]
> punpcklwd m0, m2 ;[2 6 0 4]
> @@ -2451,7 +2523,7 @@
> IDCT32_PASS1 6
> IDCT32_PASS1 7
>
> - add r0, 16
> + add r0, 8
> add r3, 4
> add r4, 4
> dec r5d
> @@ -2612,7 +2684,7 @@
> RET
>
> ;-------------------------------------------------------
> -; void idct4(int32_t *src, int16_t *dst, intptr_t stride)
> +; void idct4(int16_t *src, int16_t *dst, intptr_t stride)
> ;-------------------------------------------------------
> INIT_YMM avx2
> cglobal idct4, 3, 4, 6
> @@ -2632,13 +2704,14 @@
> add r2d, r2d
> lea r3, [r2 * 3]
>
> - movu m0, [r0] ;[00 01 02 03 10 11 12 13]
> - movu m1, [r0 + 32] ;[20 21 22 23 30 31 32 33]
> -
> - packssdw m0, m1 ;[00 01 02 03 20 21 22 23 10 11 12 13 30 31 32 33]
> - pshufb m0, [idct4_shuf1] ;[00 20 02 22 01 21 03 23 10 30 12 32 11 31 13 33]
> - vpermq m2, m0, 0x44 ;[00 20 02 22 01 21 03 23 00 20 02 22 01 21 03 23]
> - vpermq m0, m0, 0xEE ;[10 30 12 32 11 31 13 33 10 30 12 32 11 31 13 33]
> + movu m0, [r0] ;[00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33]
> +
> + pshufb m0, [idct4_shuf1] ;[00 02 01 03 10 12 11 13 20 22 21 23 30 32 31 33]
> + vextracti128 xm1, m0, 1 ;[20 22 21 23 30 32 31 33]
> + punpcklwd xm2, xm0, xm1 ;[00 20 02 22 01 21 03 23]
> + punpckhwd xm0, xm1 ;[10 30 12 32 11 31 13 33]
> + vinserti128 m2, m2, xm2, 1 ;[00 20 02 22 01 21 03 23 00 20 02 22 01 21 03 23]
> + vinserti128 m0, m0, xm0, 1 ;[10 30 12 32 11 31 13 33 10 30 12 32 11 31 13 33]
>
> mova m1, [avx2_idct4_1]
> mova m3, [avx2_idct4_1 + 32]
> diff -r 2f0062f0791b -r 706fa4af912b source/common/x86/dct8.h
> --- a/source/common/x86/dct8.h Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/common/x86/dct8.h Tue Nov 18 14:00:27 2014 +0530
> @@ -23,23 +23,23 @@
>
> #ifndef X265_DCT8_H
> #define X265_DCT8_H
> -void x265_dct4_sse2(int16_t *src, int32_t *dst, intptr_t stride);
> -void x265_dst4_ssse3(int16_t *src, int32_t *dst, intptr_t stride);
> -void x265_dct8_sse4(int16_t *src, int32_t *dst, intptr_t stride);
> -void x265_dct4_avx2(int16_t *src, int32_t *dst, intptr_t stride);
> -void x265_dct8_avx2(int16_t *src, int32_t *dst, intptr_t stride);
> -void x265_dct16_avx2(int16_t *src, int32_t *dst, intptr_t stride);
> -void x265_dct32_avx2(int16_t *src, int32_t *dst, intptr_t stride);
> -void x265_idct32_avx2(int32_t *src, int16_t *dst, intptr_t stride);
> +void x265_dct4_sse2(int16_t *src, int16_t *dst, intptr_t stride);
> +void x265_dst4_ssse3(int16_t *src, int16_t *dst, intptr_t stride);
> +void x265_dct8_sse4(int16_t *src, int16_t *dst, intptr_t stride);
> +void x265_dct4_avx2(int16_t *src, int16_t *dst, intptr_t stride);
> +void x265_dct8_avx2(int16_t *src, int16_t *dst, intptr_t stride);
> +void x265_dct16_avx2(int16_t *src, int16_t *dst, intptr_t stride);
> +void x265_dct32_avx2(int16_t *src, int16_t *dst, intptr_t stride);
> +void x265_idct32_avx2(int16_t *src, int16_t *dst, intptr_t stride);
>
> -void x265_idst4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
> -void x265_idct4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
> -void x265_idct4_avx2(int32_t *src, int16_t *dst, intptr_t stride);
> -void x265_idct8_ssse3(int32_t *src, int16_t *dst, intptr_t stride);
> -void x265_idct8_avx2(int32_t *src, int16_t *dst, intptr_t stride);
> -void x265_idct16_avx2(int32_t *src, int16_t *dst, intptr_t stride);
> +void x265_idst4_sse2(int16_t *src, int16_t *dst, intptr_t stride);
> +void x265_idct4_sse2(int16_t *src, int16_t *dst, intptr_t stride);
> +void x265_idct4_avx2(int16_t *src, int16_t *dst, intptr_t stride);
> +void x265_idct8_ssse3(int16_t *src, int16_t *dst, intptr_t stride);
> +void x265_idct8_avx2(int16_t *src, int16_t *dst, intptr_t stride);
> +void x265_idct16_avx2(int16_t *src, int16_t *dst, intptr_t stride);
>
> -void x265_denoise_dct_sse4(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
> -void x265_denoise_dct_avx2(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
> +void x265_denoise_dct_sse4(int16_t *dct, uint32_t *sum, uint16_t *offset, int size);
> +void x265_denoise_dct_avx2(int16_t *dct, uint32_t *sum, uint16_t *offset, int size);
>
> #endif // ifndef X265_DCT8_H
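With every transform prototype above now taking int16_t on both sides, the caller-side
coefficient buffers shrink by half. A hedged usage sketch against the AVX2 entry points
declared in this header (assumes x265's ALIGN_VAR_32 macro from common.h; the zeroed
residual is a dummy input):

    #include <string.h>
    #include <stdint.h>
    #include "common.h"   /* ALIGN_VAR_32 */
    #include "dct8.h"     /* prototypes shown above */

    static void transform_roundtrip_sketch(void)
    {
        ALIGN_VAR_32(int16_t, residual[8 * 8]);
        ALIGN_VAR_32(int16_t, coeff[8 * 8]);    /* was int32_t before this patch */

        memset(residual, 0, sizeof(residual));
        x265_dct8_avx2(residual, coeff, 8);     /* forward 8x8 transform           */
        x265_idct8_avx2(coeff, residual, 8);    /* inverse back to residual domain */
    }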
> diff -r 2f0062f0791b -r 706fa4af912b source/common/x86/pixel-util.h
> --- a/source/common/x86/pixel-util.h Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/common/x86/pixel-util.h Tue Nov 18 14:00:27 2014 +0530
> @@ -42,12 +42,12 @@
> void x265_transpose32_avx2(pixel *dest, pixel *src, intptr_t stride);
> void x265_transpose64_avx2(pixel *dest, pixel *src, intptr_t stride);
>
> -uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
> -uint32_t x265_quant_avx2(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
> -uint32_t x265_nquant_sse4(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
> -uint32_t x265_nquant_avx2(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
> -void x265_dequant_normal_sse4(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift);
> -void x265_dequant_normal_avx2(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift);
> +uint32_t x265_quant_sse4(int16_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
> +uint32_t x265_quant_avx2(int16_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
> +uint32_t x265_nquant_sse4(int16_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
> +uint32_t x265_nquant_avx2(int16_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
> +void x265_dequant_normal_sse4(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
> +void x265_dequant_normal_avx2(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
> int x265_count_nonzero_ssse3(const int16_t *quantCoeff, int numCoeff);
>
> void x265_weight_pp_sse4(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
> diff -r 2f0062f0791b -r 706fa4af912b source/common/x86/pixel-util8.asm
> --- a/source/common/x86/pixel-util8.asm Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/common/x86/pixel-util8.asm Tue Nov 18 14:00:27 2014 +0530
> @@ -420,7 +420,7 @@
>
>
> ;-----------------------------------------------------------------------------
> -; uint32_t quant(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
> +; uint32_t quant(int16_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
> ;-----------------------------------------------------------------------------
> INIT_XMM sse4
> cglobal quant, 5,6,8
> @@ -442,7 +442,7 @@
> pxor m7, m7 ; m7 = numZero
> .loop:
> ; 4 coeff
> - movu m0, [r0] ; m0 = level
> + pmovsxwd m0, [r0] ; m0 = level
> pabsd m1, m0
> pmulld m1, [r1] ; m0 = tmpLevel1
> paddd m2, m1, m5
> @@ -460,7 +460,7 @@
> movh [r3], m3
>
> ; 4 coeff
> - movu m0, [r0 + 16] ; m0 = level
> + pmovsxwd m0, [r0 + 8] ; m0 = level
> pabsd m1, m0
> pmulld m1, [r1 + 16] ; m0 = tmpLevel1
> paddd m2, m1, m5
> @@ -475,7 +475,7 @@
> packssdw m3, m3
> movh [r3 + 8], m3
>
> - add r0, 32
> + add r0, 16
> add r1, 32
> add r2, 32
> add r3, 16
> @@ -512,7 +512,7 @@
> pxor m7, m7 ; m7 = numZero
> .loop:
> ; 8 coeff
> - movu m0, [r0] ; m0 = level
> + pmovsxwd m0, [r0] ; m0 = level
> pabsd m1, m0
> pmulld m1, [r1] ; m0 = tmpLevel1
> paddd m2, m1, m5
> @@ -525,7 +525,7 @@
> psignd m2, m0
>
> ; 8 coeff
> - movu m0, [r0 + mmsize] ; m0 = level
> + pmovsxwd m0, [r0 + mmsize/2] ; m0 = level
> pabsd m1, m0
> pmulld m1, [r1 + mmsize] ; m0 = tmpLevel1
> paddd m3, m1, m5
> @@ -546,7 +546,7 @@
> pminuw m2, m9
> paddw m7, m2
>
> - add r0, mmsize*2
> + add r0, mmsize
> add r1, mmsize*2
> add r2, mmsize*2
> add r3, mmsize
> @@ -584,7 +584,7 @@
> pxor m7, m7 ; m7 = numZero
> .loop:
> ; 8 coeff
> - movu m0, [r0] ; m0 = level
> + pmovsxwd m0, [r0] ; m0 = level
> pabsd m1, m0
> pmulld m1, [r1] ; m0 = tmpLevel1
> paddd m2, m1, m5
> @@ -603,7 +603,7 @@
> movu [r3], xm3
>
> ; 8 coeff
> - movu m0, [r0 + mmsize] ; m0 = level
> + pmovsxwd m0, [r0 + mmsize/2] ; m0 = level
> pabsd m1, m0
> pmulld m1, [r1 + mmsize] ; m0 = tmpLevel1
> paddd m2, m1, m5
> @@ -621,7 +621,7 @@
> vpermq m3, m3, q0020
> movu [r3 + mmsize/2], xm3
>
> - add r0, mmsize*2
> + add r0, mmsize
> add r1, mmsize*2
> add r2, mmsize*2
> add r3, mmsize
> @@ -642,7 +642,7 @@
>
>
> ;-----------------------------------------------------------------------------
> -; uint32_t nquant(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
> +; uint32_t nquant(int16_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
> ;-----------------------------------------------------------------------------
> INIT_XMM sse4
> cglobal nquant, 3,5,8
> @@ -655,8 +655,8 @@
> shr r4d, 3
>
> .loop:
> - movu m0, [r0] ; m0 = level
> - movu m1, [r0 + 16] ; m1 = level
> + pmovsxwd m0, [r0] ; m0 = level
> + pmovsxwd m1, [r0 + 8] ; m1 = level
>
> pabsd m2, m0
> pmulld m2, [r1] ; m0 = tmpLevel1 * qcoeff
> @@ -673,7 +673,7 @@
> packssdw m2, m3
>
> movu [r2], m2
> - add r0, 32
> + add r0, 16
> add r1, 32
> add r2, 16
>
> @@ -703,14 +703,14 @@
> shr r4d, 4
>
> .loop:
> - movu m0, [r0] ; m0 = level
> + pmovsxwd m0, [r0] ; m0 = level
> pabsd m1, m0
> pmulld m1, [r1] ; m0 = tmpLevel1 * qcoeff
> paddd m1, m4
> psrad m1, xm3 ; m0 = level1
> psignd m1, m0
>
> - movu m0, [r0 + mmsize] ; m0 = level
> + pmovsxwd m0, [r0 + mmsize/2] ; m0 = level
> pabsd m2, m0
> pmulld m2, [r1 + mmsize] ; m0 = tmpLevel1 * qcoeff
> paddd m2, m4
> @@ -721,7 +721,7 @@
> vpermq m2, m1, q3120
>
> movu [r2], m2
> - add r0, mmsize * 2
> + add r0, mmsize
> add r1, mmsize * 2
> add r2, mmsize
>
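For orientation, the quant/nquant kernels above now read int16_t levels and sign-extend
them (pmovsxwd) before the 32-bit math. Per coefficient the operation is roughly the
following; a sketch only, the names are illustrative and the exact clipping follows the
C reference. nquant is the same minus the deltaU bookkeeping:

    #include <stdint.h>
    #include <stdlib.h>

    /* sketch of one quant step on the int16_t coefficient path */
    static uint32_t quant_sketch(const int16_t *coef, const int32_t *quantCoeff,
                                 int32_t *deltaU, int16_t *qCoef,
                                 int qBits, int add, int numCoeff)
    {
        uint32_t numSig = 0;
        for (int i = 0; i < numCoeff; i++)
        {
            int sign  = coef[i] < 0 ? -1 : 1;
            int tmp   = abs(coef[i]) * quantCoeff[i];       /* pmovsxwd + pabsd + pmulld */
            int level = (tmp + add) >> qBits;
            deltaU[i] = (tmp - (level << qBits)) >> (qBits - 8);
            if (level)
                numSig++;
            level *= sign;
            qCoef[i] = (int16_t)(level > 32767 ? 32767 : (level < -32768 ? -32768 : level));
        }
        return numSig;
    }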
> @@ -770,15 +770,11 @@
> pmaddwd m4, m1
> psrad m3, m0
> psrad m4, m0
> - packssdw m3, m3 ; OPT_ME: store must be 32 bits
> - pmovsxwd m3, m3
> - packssdw m4, m4
> - pmovsxwd m4, m4
> + packssdw m3, m4
> mova [r1], m3
> - mova [r1 + 16], m4
>
> add r0, 16
> - add r1, 32
> + add r1, 16
>
> sub r2d, 8
> jnz .loop
> @@ -818,13 +814,12 @@
> pmaxsd m3, m6
> pminsd m4, m5
> pmaxsd m4, m6
> + packssdw m3, m4
> mova [r1 + 0 * mmsize/2], xm3
> - mova [r1 + 1 * mmsize/2], xm4
> - vextracti128 [r1 + 2 * mmsize/2], m3, 1
> - vextracti128 [r1 + 3 * mmsize/2], m4, 1
> + vextracti128 [r1 + 1 * mmsize/2], m3, 1
>
> add r0, mmsize
> - add r1, mmsize * 2
> + add r1, mmsize
>
> dec r2d
> jnz .loop
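And the dequant_normal path above now saturates straight back to int16_t, which is what
removes the old "OPT_ME: store must be 32 bits" workaround. Schematically (a sketch, not
the exact reference code):

    #include <stdint.h>

    /* sketch: scale, round, shift, then saturate to int16_t (packssdw in the asm) */
    static void dequant_normal_sketch(const int16_t *quantCoef, int16_t *coef,
                                      int num, int scale, int shift)
    {
        int add = 1 << (shift - 1);
        for (int n = 0; n < num; n++)
        {
            int q = (quantCoef[n] * scale + add) >> shift;
            coef[n] = (int16_t)(q > 32767 ? 32767 : (q < -32768 ? -32768 : q));
        }
    }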
> diff -r 2f0062f0791b -r 706fa4af912b source/test/mbdstharness.cpp
> --- a/source/test/mbdstharness.cpp Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/test/mbdstharness.cpp Tue Nov 18 14:00:27 2014 +0530
> @@ -65,17 +65,17 @@
> short_test_buff[0][i] = (rand() & PIXEL_MAX) - (rand() & PIXEL_MAX);
> int_test_buff[0][i] = rand() % PIXEL_MAX;
> int_idct_test_buff[0][i] = (rand() % (SHORT_MAX - SHORT_MIN)) - SHORT_MAX;
> - int_denoise_test_buff1[0][i] = int_denoise_test_buff2[0][i] = (rand() & UNSIGNED_SHORT_MAX) - (rand() & UNSIGNED_SHORT_MAX);
> + short_denoise_test_buff1[0][i] = short_denoise_test_buff2[0][i] = (rand() & SHORT_MAX) - (rand() & SHORT_MAX);
>
> short_test_buff[1][i] = -PIXEL_MAX;
> int_test_buff[1][i] = -PIXEL_MAX;
> int_idct_test_buff[1][i] = SHORT_MIN;
> - int_denoise_test_buff1[1][i] = int_denoise_test_buff2[1][i] = -UNSIGNED_SHORT_MAX;
> + short_denoise_test_buff1[1][i] = short_denoise_test_buff2[1][i] = -SHORT_MAX;
>
> short_test_buff[2][i] = PIXEL_MAX;
> int_test_buff[2][i] = PIXEL_MAX;
> int_idct_test_buff[2][i] = SHORT_MAX;
> - int_denoise_test_buff1[2][i] = int_denoise_test_buff2[2][i] = UNSIGNED_SHORT_MAX;
> + short_denoise_test_buff1[2][i] = short_denoise_test_buff2[2][i] = SHORT_MAX;
>
> mbuf1[i] = rand() & PIXEL_MAX;
> mbufdct[i] = (rand() & PIXEL_MAX) - (rand() & PIXEL_MAX);
> @@ -96,16 +96,16 @@
> bool MBDstHarness::check_dct_primitive(dct_t ref, dct_t opt, intptr_t width)
> {
> int j = 0;
> - intptr_t cmp_size = sizeof(int) * width * width;
> + intptr_t cmp_size = sizeof(short) * width * width;
>
> for (int i = 0; i < ITERS; i++)
> {
> int index = rand() % TEST_CASES;
>
> - ref(short_test_buff[index] + j, mintbuf3, width);
> - checked(opt, short_test_buff[index] + j, mintbuf4, width);
> + ref(short_test_buff[index] + j, mshortbuf2, width);
> + checked(opt, short_test_buff[index] + j, mshortbuf3, width);
>
> - if (memcmp(mintbuf3, mintbuf4, cmp_size))
> + if (memcmp(mshortbuf2, mshortbuf3, cmp_size))
> return false;
>
> reportfail();
> @@ -124,8 +124,8 @@
> {
> int index = rand() % TEST_CASES;
>
> - ref(int_idct_test_buff[index] + j, mshortbuf2, width);
> - checked(opt, int_idct_test_buff[index] + j, mshortbuf3, width);
> + ref(short_test_buff[index] + j, mshortbuf2, width);
> + checked(opt, short_test_buff[index] + j, mshortbuf3, width);
>
> if (memcmp(mshortbuf2, mshortbuf3, cmp_size))
> return false;
> @@ -156,10 +156,10 @@
> int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
> int shift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift;
>
> - ref(short_test_buff[index] + j, mintbuf3, width * height, scale, shift);
> - checked(opt, short_test_buff[index] + j, mintbuf4, width * height, scale, shift);
> + ref(short_test_buff[index] + j, mshortbuf2, width * height, scale, shift);
> + checked(opt, short_test_buff[index] + j, mshortbuf3, width * height, scale, shift);
>
> - if (memcmp(mintbuf3, mintbuf4, sizeof(int) * height * width))
> + if (memcmp(mshortbuf2, mshortbuf3, sizeof(int16_t) * height * width))
> return false;
>
> reportfail();
> @@ -175,6 +175,10 @@
>
> for (int i = 0; i < ITERS; i++)
> {
> +
> + memset(mshortbuf2, 0, MAX_TU_SIZE * sizeof(int16_t));
> + memset(mshortbuf3, 0, MAX_TU_SIZE * sizeof(int16_t));
> +
> int log2TrSize = (rand() % 4) + 2;
>
> int width = (1 << log2TrSize);
> @@ -185,13 +189,13 @@
> int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
> int shift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift;
>
> - int cmp_size = sizeof(int) * height * width;
> + int cmp_size = sizeof(int16_t) * height * width;
> int index1 = rand() % TEST_CASES;
>
> - ref(short_test_buff[index1] + j, mintbuf3, mintbuf1, width * height, per, shift);
> - checked(opt, short_test_buff[index1] + j, mintbuf4, mintbuf2, width * height, per, shift);
> + ref(short_test_buff[index1] + j, int_test_buff[index1] + j, mshortbuf2, width * height, per, shift);
> + checked(opt, short_test_buff[index1] + j, int_test_buff[index1] + j, mshortbuf3, width * height, per, shift);
>
> - if (memcmp(mintbuf1, mintbuf2, cmp_size))
> + if (memcmp(mshortbuf2, mshortbuf3, cmp_size))
> return false;
>
> reportfail();
> @@ -222,8 +226,8 @@
> int index1 = rand() % TEST_CASES;
> int index2 = rand() % TEST_CASES;
>
> - refReturnValue = ref(int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf1, mshortbuf2, bits, valueToAdd, numCoeff);
> - optReturnValue = (uint32_t)checked(opt, int_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf3, mshortbuf3, bits, valueToAdd, numCoeff);
> + refReturnValue = ref(short_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf1, mshortbuf2, bits, valueToAdd, numCoeff);
> + optReturnValue = (uint32_t)checked(opt, short_test_buff[index1] + j, int_test_buff[index2] + j, mintbuf3, mshortbuf3, bits, valueToAdd, numCoeff);
>
> if (memcmp(mintbuf1, mintbuf3, cmp_size))
> return false;
> @@ -261,8 +265,8 @@
> int index1 = rand() % TEST_CASES;
> int index2 = rand() % TEST_CASES;
>
> - refReturnValue = ref(int_test_buff[index1] + j, int_test_buff[index2] + j, mshortbuf2, bits, valueToAdd, numCoeff);
> - optReturnValue = (uint32_t)checked(opt, int_test_buff[index1] + j, int_test_buff[index2] + j, mshortbuf3, bits, valueToAdd, numCoeff);
> + refReturnValue = ref(short_test_buff[index1] + j, int_test_buff[index2] + j, mshortbuf2, bits, valueToAdd, numCoeff);
> + optReturnValue = (uint32_t)checked(opt, short_test_buff[index1] + j, int_test_buff[index2] + j, mshortbuf3, bits, valueToAdd, numCoeff);
>
> if (memcmp(mshortbuf2, mshortbuf3, cmp_size))
> return false;
> @@ -324,6 +328,7 @@
> int log2TrSize = s + 2;
> int num = 1 << (log2TrSize * 2);
> int cmp_size = sizeof(int) * num;
> + int cmp_short = sizeof(short) * num;
>
> for (int i = 0; i < ITERS; i++)
> {
> @@ -336,10 +341,10 @@
>
> int index = rand() % TEST_CASES;
>
> - ref(int_denoise_test_buff1[index] + j, mubuf1, mushortbuf1, num);
> - checked(opt, int_denoise_test_buff2[index] + j, mubuf2, mushortbuf1, num);
> + ref(short_denoise_test_buff1[index] + j, mubuf1, mushortbuf1, num);
> + checked(opt, short_denoise_test_buff2[index] + j, mubuf2, mushortbuf1, num);
>
> - if (memcmp(int_denoise_test_buff1[index] + j, int_denoise_test_buff2[index] + j, cmp_size))
> + if (memcmp(short_denoise_test_buff1[index] + j, short_denoise_test_buff2[index] + j, cmp_short))
> return false;
>
> if (memcmp(mubuf1, mubuf2, cmp_size))
> @@ -454,7 +459,7 @@
> if (opt.dct[value])
> {
> printf("%s\t", dctInfo[value].name);
> - REPORT_SPEEDUP(opt.dct[value], ref.dct[value], mbuf1, mintbuf3, dctInfo[value].width);
> + REPORT_SPEEDUP(opt.dct[value], ref.dct[value], mbuf1, mshortbuf2, dctInfo[value].width);
> }
> }
>
> @@ -463,32 +468,32 @@
> if (opt.idct[value])
> {
> printf("%s\t", idctInfo[value].name);
> - REPORT_SPEEDUP(opt.idct[value], ref.idct[value], mbufidct, mshortbuf2, idctInfo[value].width);
> + REPORT_SPEEDUP(opt.idct[value], ref.idct[value], mshortbuf3, mshortbuf2, idctInfo[value].width);
> }
> }
>
> if (opt.dequant_normal)
> {
> printf("dequant_normal\t");
> - REPORT_SPEEDUP(opt.dequant_normal, ref.dequant_normal, short_test_buff[0], mintbuf3, 32 * 32, 70, 1);
> + REPORT_SPEEDUP(opt.dequant_normal, ref.dequant_normal, short_test_buff[0], mshortbuf2, 32 * 32, 70, 1);
> }
>
> if (opt.dequant_scaling)
> {
> printf("dequant_scaling\t");
> - REPORT_SPEEDUP(opt.dequant_scaling, ref.dequant_scaling, short_test_buff[0], mintbuf3, mintbuf4, 32 * 32, 5, 1);
> + REPORT_SPEEDUP(opt.dequant_scaling, ref.dequant_scaling, short_test_buff[0], mintbuf3, mshortbuf2, 32 * 32, 5, 1);
> }
>
> if (opt.quant)
> {
> printf("quant\t\t");
> - REPORT_SPEEDUP(opt.quant, ref.quant, int_test_buff[0], int_test_buff[1], mintbuf3, mshortbuf2, 23, 23785, 32 * 32);
> + REPORT_SPEEDUP(opt.quant, ref.quant, short_test_buff[0], int_test_buff[1], mintbuf3, mshortbuf2, 23, 23785, 32 * 32);
> }
>
> if (opt.nquant)
> {
> printf("nquant\t\t");
> - REPORT_SPEEDUP(opt.nquant, ref.nquant, int_test_buff[0], int_test_buff[1], mshortbuf2, 23, 23785, 32 * 32);
> + REPORT_SPEEDUP(opt.nquant, ref.nquant, short_test_buff[0], int_test_buff[1], mshortbuf2, 23, 23785, 32 * 32);
> }
>
> if (opt.count_nonzero)
> @@ -503,7 +508,7 @@
> if (opt.denoiseDct)
> {
> printf("denoiseDct\t");
> - REPORT_SPEEDUP(opt.denoiseDct, ref.denoiseDct, int_denoise_test_buff1[0], mubuf1, mushortbuf1, 32 * 32);
> + REPORT_SPEEDUP(opt.denoiseDct, ref.denoiseDct, short_denoise_test_buff1[0], mubuf1, mushortbuf1, 32 * 32);
> }
>
> }
> diff -r 2f0062f0791b -r 706fa4af912b source/test/mbdstharness.h
> --- a/source/test/mbdstharness.h Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/test/mbdstharness.h Tue Nov 18 14:00:27 2014 +0530
> @@ -60,8 +60,8 @@
> uint32_t mubuf2[MAX_TU_SIZE];
> uint16_t mushortbuf1[MAX_TU_SIZE];
>
> - int int_denoise_test_buff1[TEST_CASES][TEST_BUF_SIZE];
> - int int_denoise_test_buff2[TEST_CASES][TEST_BUF_SIZE];
> + int16_t short_denoise_test_buff1[TEST_CASES][TEST_BUF_SIZE];
> + int16_t short_denoise_test_buff2[TEST_CASES][TEST_BUF_SIZE];
>
> bool check_dequant_primitive(dequant_scaling_t ref, dequant_scaling_t opt);
> bool check_dequant_primitive(dequant_normal_t ref, dequant_normal_t opt);
> diff -r 2f0062f0791b -r 706fa4af912b source/test/pixelharness.cpp
> --- a/source/test/pixelharness.cpp Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/test/pixelharness.cpp Tue Nov 18 14:00:27 2014 +0530
> @@ -344,39 +344,11 @@
> return true;
> }
>
> -bool PixelHarness::check_cvt32to16_shr_t(cvt32to16_shr_t ref, cvt32to16_shr_t opt)
> +bool PixelHarness::check_copy16to16_shl_t(cpy16to16_shl_t ref, cpy16to16_shl_t opt)
> {
> ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
> ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
>
> - memset(ref_dest, 0xCD, sizeof(ref_dest));
> - memset(opt_dest, 0xCD, sizeof(opt_dest));
> -
> - int j = 0;
> - intptr_t stride = STRIDE;
> - for (int i = 0; i < ITERS; i++)
> - {
> - int shift = (rand() % 7 + 1);
> -
> - int index = i % TEST_CASES;
> - checked(opt, opt_dest, int_test_buff[index] + j, stride, shift, (int)STRIDE);
> - ref(ref_dest, int_test_buff[index] + j, stride, shift, (int)STRIDE);
> -
> - if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
> - return false;
> -
> - reportfail();
> - j += INCR;
> - }
> -
> - return true;
> -}
> -
> -bool PixelHarness::check_cvt16to32_shl_t(cvt16to32_shl_t ref, cvt16to32_shl_t opt)
> -{
> - ALIGN_VAR_16(int32_t, ref_dest[64 * 64]);
> - ALIGN_VAR_16(int32_t, opt_dest[64 * 64]);
> -
> int j = 0;
> intptr_t stride = STRIDE;
> for (int i = 0; i < ITERS; i++)
> @@ -387,7 +359,7 @@
> checked(opt, opt_dest, short_test_buff[index] + j, stride, shift, (int)stride);
> ref(ref_dest, short_test_buff[index] + j, stride, shift, (int)stride);
>
> - if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int32_t)))
> + if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
> return false;
>
> reportfail();
> @@ -1337,20 +1309,11 @@
>
> }
>
> - if (opt.cvt32to16_shr)
> + if (opt.cpy16to16_shl)
> {
> - if (!check_cvt32to16_shr_t(ref.cvt32to16_shr, opt.cvt32to16_shr))
> + if (!check_copy16to16_shl_t(ref.cpy16to16_shl, opt.cpy16to16_shl))
> {
> - printf("cvt32to16 failed!\n");
> - return false;
> - }
> - }
> -
> - if (opt.cvt16to32_shl)
> - {
> - if (!check_cvt16to32_shl_t(ref.cvt16to32_shl, opt.cvt16to32_shl))
> - {
> - printf("cvt16to32_shl failed!\n");
> + printf("copy16to16_shl failed!\n");
> return false;
> }
> }
> @@ -1700,16 +1663,10 @@
>
> }
>
> - if (opt.cvt32to16_shr)
> + if (opt.cpy16to16_shl)
> {
> - HEADER0("cvt32to16_shr");
> - REPORT_SPEEDUP(opt.cvt32to16_shr, ref.cvt32to16_shr, sbuf1, ibuf1, 64, 5, 64);
> - }
> -
> - if (opt.cvt16to32_shl)
> - {
> - HEADER0("cvt16to32_shl");
> - REPORT_SPEEDUP(opt.cvt16to32_shl, ref.cvt16to32_shl, ibuf1, sbuf1, 64, 5, 64);
> + HEADER0("cpy16to16_shl");
> + REPORT_SPEEDUP(opt.cpy16to16_shl, ref.cpy16to16_shl, sbuf2, sbuf1, 64, 5, 64);
> }
>
> if (opt.weight_pp)
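One note on the harness change above: judging from the checked()/REPORT_SPEEDUP calls,
the new cpy16to16_shl primitive is a strided 16-bit block copy with a left shift,
replacing the removed cvt32to16_shr/cvt16to32_shl checks. A sketch with the inferred
signature (the stride is assumed to apply to the source, as the old cvt16to32_shl did):

    #include <stdint.h>

    /* sketch: copy a size x size block of int16_t, shifting each value left */
    static void cpy16to16_shl_sketch(int16_t *dst, const int16_t *src,
                                     intptr_t stride, int shift, int size)
    {
        for (int i = 0; i < size; i++)
            for (int j = 0; j < size; j++)
                dst[i * size + j] = (int16_t)(src[i * stride + j] << shift);
    }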
> diff -r 2f0062f0791b -r 706fa4af912b source/test/pixelharness.h
> --- a/source/test/pixelharness.h Tue Nov 18 11:31:39 2014 +0530
> +++ b/source/test/pixelharness.h Tue Nov 18 14:00:27 2014 +0530
> @@ -80,8 +80,7 @@
> bool check_weightp(weightp_pp_t ref, weightp_pp_t opt);
> bool check_weightp(weightp_sp_t ref, weightp_sp_t opt);
> bool check_downscale_t(downscale_t ref, downscale_t opt);
> - bool check_cvt32to16_shr_t(cvt32to16_shr_t ref, cvt32to16_shr_t opt);
> - bool check_cvt16to32_shl_t(cvt16to32_shl_t ref, cvt16to32_shl_t opt);
> + bool check_copy16to16_shl_t(cpy16to16_shl_t ref, cpy16to16_shl_t opt);
> bool check_cvt16to32_shr_t(cvt16to32_shr_t ref, cvt16to32_shr_t opt);
> bool check_cvt32to16_shl_t(cvt32to16_shl_t ref, cvt32to16_shl_t opt);
> bool check_copy_cnt_t(copy_cnt_t ref, copy_cnt_t opt);
--
Steve Borho