[x265] primitives: refactor tskip-related functions
Satoshi Nakagawa
nakagawa424 at oki.com
Thu Nov 27 02:14:26 CET 2014
# HG changeset patch
# User Satoshi Nakagawa <nakagawa424 at oki.com>
# Date 1417050723 -32400
# Thu Nov 27 10:12:03 2014 +0900
# Node ID b4454aa1b6ab610c20241eb8fd5c73268b1ae3e0
# Parent dfe0803ae6be925281cd6101fc0354a34bedfefd
primitives: refactor tskip-related functions
diff -r dfe0803ae6be -r b4454aa1b6ab source/common/dct.cpp
--- a/source/common/dct.cpp Wed Nov 26 16:56:00 2014 -0600
+++ b/source/common/dct.cpp Thu Nov 27 10:12:03 2014 +0900
@@ -440,7 +440,7 @@
}
}
-void dst4_c(const int16_t *src, int16_t *dst, intptr_t stride)
+void dst4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
{
const int shift_1st = 1 + X265_DEPTH - 8;
const int shift_2nd = 8;
@@ -450,14 +450,14 @@
for (int i = 0; i < 4; i++)
{
- memcpy(&block[i * 4], &src[i * stride], 4 * sizeof(int16_t));
+ memcpy(&block[i * 4], &src[i * srcStride], 4 * sizeof(int16_t));
}
fastForwardDst(block, coef, shift_1st);
fastForwardDst(coef, dst, shift_2nd);
}
-void dct4_c(const int16_t *src, int16_t *dst, intptr_t stride)
+void dct4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
{
const int shift_1st = 1 + X265_DEPTH - 8;
const int shift_2nd = 8;
@@ -467,14 +467,14 @@
for (int i = 0; i < 4; i++)
{
- memcpy(&block[i * 4], &src[i * stride], 4 * sizeof(int16_t));
+ memcpy(&block[i * 4], &src[i * srcStride], 4 * sizeof(int16_t));
}
partialButterfly4(block, coef, shift_1st, 4);
partialButterfly4(coef, dst, shift_2nd, 4);
}
-void dct8_c(const int16_t *src, int16_t *dst, intptr_t stride)
+void dct8_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
{
const int shift_1st = 2 + X265_DEPTH - 8;
const int shift_2nd = 9;
@@ -484,14 +484,14 @@
for (int i = 0; i < 8; i++)
{
- memcpy(&block[i * 8], &src[i * stride], 8 * sizeof(int16_t));
+ memcpy(&block[i * 8], &src[i * srcStride], 8 * sizeof(int16_t));
}
partialButterfly8(block, coef, shift_1st, 8);
partialButterfly8(coef, dst, shift_2nd, 8);
}
-void dct16_c(const int16_t *src, int16_t *dst, intptr_t stride)
+void dct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
{
const int shift_1st = 3 + X265_DEPTH - 8;
const int shift_2nd = 10;
@@ -501,14 +501,14 @@
for (int i = 0; i < 16; i++)
{
- memcpy(&block[i * 16], &src[i * stride], 16 * sizeof(int16_t));
+ memcpy(&block[i * 16], &src[i * srcStride], 16 * sizeof(int16_t));
}
partialButterfly16(block, coef, shift_1st, 16);
partialButterfly16(coef, dst, shift_2nd, 16);
}
-void dct32_c(const int16_t *src, int16_t *dst, intptr_t stride)
+void dct32_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
{
const int shift_1st = 4 + X265_DEPTH - 8;
const int shift_2nd = 11;
@@ -518,14 +518,14 @@
for (int i = 0; i < 32; i++)
{
- memcpy(&block[i * 32], &src[i * stride], 32 * sizeof(int16_t));
+ memcpy(&block[i * 32], &src[i * srcStride], 32 * sizeof(int16_t));
}
partialButterfly32(block, coef, shift_1st, 32);
partialButterfly32(coef, dst, shift_2nd, 32);
}
-void idst4_c(const int16_t *src, int16_t *dst, intptr_t stride)
+void idst4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
{
const int shift_1st = 7;
const int shift_2nd = 12 - (X265_DEPTH - 8);
@@ -538,11 +538,11 @@
for (int i = 0; i < 4; i++)
{
- memcpy(&dst[i * stride], &block[i * 4], 4 * sizeof(int16_t));
+ memcpy(&dst[i * dstStride], &block[i * 4], 4 * sizeof(int16_t));
}
}
-void idct4_c(const int16_t *src, int16_t *dst, intptr_t stride)
+void idct4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
{
const int shift_1st = 7;
const int shift_2nd = 12 - (X265_DEPTH - 8);
@@ -555,11 +555,11 @@
for (int i = 0; i < 4; i++)
{
- memcpy(&dst[i * stride], &block[i * 4], 4 * sizeof(int16_t));
+ memcpy(&dst[i * dstStride], &block[i * 4], 4 * sizeof(int16_t));
}
}
-void idct8_c(const int16_t *src, int16_t *dst, intptr_t stride)
+void idct8_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
{
const int shift_1st = 7;
const int shift_2nd = 12 - (X265_DEPTH - 8);
@@ -569,13 +569,14 @@
partialButterflyInverse8(src, coef, shift_1st, 8);
partialButterflyInverse8(coef, block, shift_2nd, 8);
+
for (int i = 0; i < 8; i++)
{
- memcpy(&dst[i * stride], &block[i * 8], 8 * sizeof(int16_t));
+ memcpy(&dst[i * dstStride], &block[i * 8], 8 * sizeof(int16_t));
}
}
-void idct16_c(const int16_t *src, int16_t *dst, intptr_t stride)
+void idct16_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
{
const int shift_1st = 7;
const int shift_2nd = 12 - (X265_DEPTH - 8);
@@ -585,13 +586,14 @@
partialButterflyInverse16(src, coef, shift_1st, 16);
partialButterflyInverse16(coef, block, shift_2nd, 16);
+
for (int i = 0; i < 16; i++)
{
- memcpy(&dst[i * stride], &block[i * 16], 16 * sizeof(int16_t));
+ memcpy(&dst[i * dstStride], &block[i * 16], 16 * sizeof(int16_t));
}
}
-void idct32_c(const int16_t *src, int16_t *dst, intptr_t stride)
+void idct32_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
{
const int shift_1st = 7;
const int shift_2nd = 12 - (X265_DEPTH - 8);
@@ -604,7 +606,7 @@
for (int i = 0; i < 32; i++)
{
- memcpy(&dst[i * stride], &block[i * 32], 32 * sizeof(int16_t));
+ memcpy(&dst[i * dstStride], &block[i * 32], 32 * sizeof(int16_t));
}
}
@@ -632,7 +634,7 @@
}
}
-void dequant_scaling_c(const int16_t* quantCoef, const int32_t *deQuantCoef, int16_t* coef, int num, int per, int shift)
+void dequant_scaling_c(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift)
{
X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);
@@ -724,15 +726,15 @@
}
template<int trSize>
-uint32_t copy_count(int16_t* coeff, const int16_t* residual, intptr_t stride)
+uint32_t copy_count(int16_t* coeff, const int16_t* residual, intptr_t resiStride)
{
uint32_t numSig = 0;
for (int k = 0; k < trSize; k++)
{
for (int j = 0; j < trSize; j++)
{
- coeff[k * trSize + j] = residual[k * stride + j];
- numSig += (residual[k * stride + j] != 0);
+ coeff[k * trSize + j] = residual[k * resiStride + j];
+ numSig += (residual[k * resiStride + j] != 0);
}
}
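The stride renames above also document the data flow: the forward transforms (dst4/dct4..dct32) gather a strided 2D residual into a dense 1D block, so only the source carries a stride (srcStride), while the inverse transforms (idst4/idct4..idct32) scatter a dense coefficient block back out, so only the destination does (dstStride). A minimal sketch of the two access patterns; gather4x4/scatter4x4 are illustrative names, not functions in the patch:

    #include <cstdint>
    #include <cstring>

    // forward direction: strided 2D source -> dense 1D block
    static void gather4x4(int16_t dense[4 * 4], const int16_t* src, intptr_t srcStride)
    {
        for (int i = 0; i < 4; i++)
            memcpy(&dense[i * 4], &src[i * srcStride], 4 * sizeof(int16_t));
    }

    // inverse direction: dense 1D block -> strided 2D destination
    static void scatter4x4(int16_t* dst, intptr_t dstStride, const int16_t dense[4 * 4])
    {
        for (int i = 0; i < 4; i++)
            memcpy(&dst[i * dstStride], &dense[i * 4], 4 * sizeof(int16_t));
    }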
diff -r dfe0803ae6be -r b4454aa1b6ab source/common/pixel.cpp
--- a/source/common/pixel.cpp Wed Nov 26 16:56:00 2014 -0600
+++ b/source/common/pixel.cpp Thu Nov 27 10:12:03 2014 +0900
@@ -32,32 +32,32 @@
using namespace x265;
-#define SET_FUNC_PRIMITIVE_TABLE_C(FUNC_PREFIX, FUNC_PREFIX_DEF, FUNC_TYPE_CAST, DATA_TYPE1, DATA_TYPE2) \
- p.FUNC_PREFIX[LUMA_4x4] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4, 4, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_8x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 8, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_8x4] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 4, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_4x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4, 8, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_16x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 16, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_16x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 8, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_8x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 16, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_16x12] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 12, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_12x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<12, 16, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_16x4] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 4, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_4x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4, 16, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_32x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 32, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_32x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 16, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_16x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 32, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_32x24] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 24, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_24x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<24, 32, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_32x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 8, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_8x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 32, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_64x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 64, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_64x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 32, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_32x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 64, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_64x48] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 48, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_48x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<48, 64, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_64x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 16, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_16x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 64, DATA_TYPE1, DATA_TYPE2>;
+#define SET_FUNC_PRIMITIVE_TABLE_C(FUNC_PREFIX, FUNC_PREFIX_DEF, DATA_TYPE1, DATA_TYPE2) \
+ p.FUNC_PREFIX[LUMA_4x4] = FUNC_PREFIX_DEF<4, 4, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_8x8] = FUNC_PREFIX_DEF<8, 8, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_8x4] = FUNC_PREFIX_DEF<8, 4, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_4x8] = FUNC_PREFIX_DEF<4, 8, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_16x16] = FUNC_PREFIX_DEF<16, 16, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_16x8] = FUNC_PREFIX_DEF<16, 8, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_8x16] = FUNC_PREFIX_DEF<8, 16, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_16x12] = FUNC_PREFIX_DEF<16, 12, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_12x16] = FUNC_PREFIX_DEF<12, 16, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_16x4] = FUNC_PREFIX_DEF<16, 4, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_4x16] = FUNC_PREFIX_DEF<4, 16, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_32x32] = FUNC_PREFIX_DEF<32, 32, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_32x16] = FUNC_PREFIX_DEF<32, 16, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_16x32] = FUNC_PREFIX_DEF<16, 32, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_32x24] = FUNC_PREFIX_DEF<32, 24, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_24x32] = FUNC_PREFIX_DEF<24, 32, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_32x8] = FUNC_PREFIX_DEF<32, 8, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_8x32] = FUNC_PREFIX_DEF<8, 32, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_64x64] = FUNC_PREFIX_DEF<64, 64, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_64x32] = FUNC_PREFIX_DEF<64, 32, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_32x64] = FUNC_PREFIX_DEF<32, 64, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_64x48] = FUNC_PREFIX_DEF<64, 48, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_48x64] = FUNC_PREFIX_DEF<48, 64, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_64x16] = FUNC_PREFIX_DEF<64, 16, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_16x64] = FUNC_PREFIX_DEF<16, 64, DATA_TYPE1, DATA_TYPE2>;
#define SET_FUNC_PRIMITIVE_TABLE_C2(FUNC_PREFIX) \
p.FUNC_PREFIX[LUMA_4x4] = FUNC_PREFIX<4, 4>; \
@@ -491,73 +491,73 @@
}
}
-void copy16to16_shl(int16_t *dst, const int16_t *src, intptr_t stride, int shift, int size)
+template<int size>
+void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
{
- X265_CHECK(!(size & 3), "invalid size\n");
+ X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
+ X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n");
+ X265_CHECK(shift >= 0, "invalid shift\n");
+
for (int i = 0; i < size; i++)
{
for (int j = 0; j < size; j++)
- {
- dst[i * size + j] = src[i * stride + j] << shift;
- }
+ dst[j] = src[j] << shift;
+
+ src += srcStride;
+ dst += size;
}
}
template<int size>
-void convert16to32_shr(int32_t* dst, const int16_t* src, intptr_t stride, int shift, int offset)
+void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
{
+ X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
+ X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n");
+ X265_CHECK(shift > 0, "invalid shift\n");
+
+ int16_t round = 1 << (shift - 1);
for (int i = 0; i < size; i++)
{
for (int j = 0; j < size; j++)
- {
- dst[i * size + j] = ((int)src[i * stride + j] + offset) >> shift;
- }
+ dst[j] = (src[j] + round) >> shift;
+
+ src += srcStride;
+ dst += size;
}
}
-void copy_shr(int16_t* dst, const int16_t* src, intptr_t stride, int shift, int size)
+template<int size>
+void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
{
- int round = 1 << (shift - 1);
+ X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n");
+ X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
+ X265_CHECK(shift >= 0, "invalid shift\n");
for (int i = 0; i < size; i++)
{
for (int j = 0; j < size; j++)
- {
- dst[j] = (int16_t)((src[j] + round) >> shift);
- }
+ dst[j] = src[j] << shift;
src += size;
- dst += stride;
+ dst += dstStride;
}
}
template<int size>
-void convert32to16_shl(int16_t* dst, const int32_t* src, intptr_t stride, int shift)
+void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
{
+ X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n");
+ X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
+ X265_CHECK(shift > 0, "invalid shift\n");
+
+ int16_t round = 1 << (shift - 1);
for (int i = 0; i < size; i++)
{
for (int j = 0; j < size; j++)
- {
- dst[j] = ((int16_t)src[j] << shift);
- }
+ dst[j] = (src[j] + round) >> shift;
src += size;
- dst += stride;
- }
-}
-
-template<int size>
-void copy_shl(int16_t* dst, const int16_t* src, intptr_t stride, int shift)
-{
- for (int i = 0; i < size; i++)
- {
- for (int j = 0; j < size; j++)
- {
- dst[j] = (src[j] << shift);
- }
-
- src += size;
- dst += stride;
+ dst += dstStride;
}
}
@@ -1263,9 +1263,9 @@
CHROMA_444(64, 16);
CHROMA_444(16, 64);
- SET_FUNC_PRIMITIVE_TABLE_C(sse_pp, sse, pixelcmp_t, pixel, pixel)
- SET_FUNC_PRIMITIVE_TABLE_C(sse_sp, sse, pixelcmp_sp_t, int16_t, pixel)
- SET_FUNC_PRIMITIVE_TABLE_C(sse_ss, sse, pixelcmp_ss_t, int16_t, int16_t)
+ SET_FUNC_PRIMITIVE_TABLE_C(sse_pp, sse, pixel, pixel)
+ SET_FUNC_PRIMITIVE_TABLE_C(sse_sp, sse, int16_t, pixel)
+ SET_FUNC_PRIMITIVE_TABLE_C(sse_ss, sse, int16_t, int16_t)
p.blockfill_s[BLOCK_4x4] = blockfil_s_c<4>;
p.blockfill_s[BLOCK_8x8] = blockfil_s_c<8>;
@@ -1273,21 +1273,22 @@
p.blockfill_s[BLOCK_32x32] = blockfil_s_c<32>;
p.blockfill_s[BLOCK_64x64] = blockfil_s_c<64>;
- p.cpy16to16_shl = copy16to16_shl;
- p.cvt16to32_shr[BLOCK_4x4] = convert16to32_shr<4>;
- p.cvt16to32_shr[BLOCK_8x8] = convert16to32_shr<8>;
- p.cvt16to32_shr[BLOCK_16x16] = convert16to32_shr<16>;
- p.cvt16to32_shr[BLOCK_32x32] = convert16to32_shr<32>;
- p.cvt32to16_shl[BLOCK_4x4] = convert32to16_shl<4>;
- p.cvt32to16_shl[BLOCK_8x8] = convert32to16_shl<8>;
- p.cvt32to16_shl[BLOCK_16x16] = convert32to16_shl<16>;
- p.cvt32to16_shl[BLOCK_32x32] = convert32to16_shl<32>;
-
- p.copy_shr = copy_shr;
- p.copy_shl[BLOCK_4x4] = copy_shl<4>;
- p.copy_shl[BLOCK_8x8] = copy_shl<8>;
- p.copy_shl[BLOCK_16x16] = copy_shl<16>;
- p.copy_shl[BLOCK_32x32] = copy_shl<32>;
+ p.cpy2Dto1D_shl[BLOCK_4x4] = cpy2Dto1D_shl<4>;
+ p.cpy2Dto1D_shl[BLOCK_8x8] = cpy2Dto1D_shl<8>;
+ p.cpy2Dto1D_shl[BLOCK_16x16] = cpy2Dto1D_shl<16>;
+ p.cpy2Dto1D_shl[BLOCK_32x32] = cpy2Dto1D_shl<32>;
+ p.cpy2Dto1D_shr[BLOCK_4x4] = cpy2Dto1D_shr<4>;
+ p.cpy2Dto1D_shr[BLOCK_8x8] = cpy2Dto1D_shr<8>;
+ p.cpy2Dto1D_shr[BLOCK_16x16] = cpy2Dto1D_shr<16>;
+ p.cpy2Dto1D_shr[BLOCK_32x32] = cpy2Dto1D_shr<32>;
+ p.cpy1Dto2D_shl[BLOCK_4x4] = cpy1Dto2D_shl<4>;
+ p.cpy1Dto2D_shl[BLOCK_8x8] = cpy1Dto2D_shl<8>;
+ p.cpy1Dto2D_shl[BLOCK_16x16] = cpy1Dto2D_shl<16>;
+ p.cpy1Dto2D_shl[BLOCK_32x32] = cpy1Dto2D_shl<32>;
+ p.cpy1Dto2D_shr[BLOCK_4x4] = cpy1Dto2D_shr<4>;
+ p.cpy1Dto2D_shr[BLOCK_8x8] = cpy1Dto2D_shr<8>;
+ p.cpy1Dto2D_shr[BLOCK_16x16] = cpy1Dto2D_shr<16>;
+ p.cpy1Dto2D_shr[BLOCK_32x32] = cpy1Dto2D_shr<32>;
p.sa8d[BLOCK_4x4] = satd_4x4;
p.sa8d[BLOCK_8x8] = sa8d_8x8;
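The four templates above replace the old mixed-width converters (copy16to16_shl, convert16to32_shr, convert32to16_shl, copy_shl/copy_shr) with one symmetric int16_t family: cpy2Dto1D_{shl,shr} read a strided 2D block into a dense 1D array, cpy1Dto2D_{shl,shr} write a dense 1D array back to a strided 2D block, and the shift carries the transform-skip scaling in each direction (shr rounds, shl does not). A hedged round-trip sketch over these C templates; the stride of 64 and shift of 5 are illustrative, and both buffers are assumed 16-byte aligned (ALIGN_VAR_16 as in x265's common.h) to satisfy the X265_CHECKs:

    ALIGN_VAR_16(int16_t, residual[4 * 64]); // 4 rows of a stride-64 residual plane
    ALIGN_VAR_16(int16_t, dense[4 * 4]);     // dense coefficient block

    cpy2Dto1D_shl<4>(dense, residual, 64, 5); // gather rows, scale up
    cpy1Dto2D_shr<4>(residual, dense, 64, 5); // scatter rows, scale down with rounding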
diff -r dfe0803ae6be -r b4454aa1b6ab source/common/primitives.h
--- a/source/common/primitives.h Wed Nov 26 16:56:00 2014 -0600
+++ b/source/common/primitives.h Thu Nov 27 10:12:03 2014 +0900
@@ -138,32 +138,27 @@
typedef int (*pixel_ssd_s_t)(const int16_t* fenc, intptr_t fencstride);
typedef void (*pixelcmp_x4_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
typedef void (*pixelcmp_x3_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
-typedef void (*blockcpy_sp_t)(int bx, int by, int16_t* dst, intptr_t dstride, const pixel* src, intptr_t sstride); // dst is aligned
-typedef void (*blockcpy_sc_t)(int bx, int by, int16_t* dst, intptr_t dstride, const uint8_t* src, intptr_t sstride); // dst is aligned
-typedef void (*pixelsub_ps_t)(int bx, int by, int16_t* dst, intptr_t dstride, const pixel* src0, const pixel* src1, intptr_t sstride0, intptr_t sstride1);
typedef void (*pixelavg_pp_t)(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int weight);
typedef void (*blockfill_s_t)(int16_t* dst, intptr_t dstride, int16_t val);
typedef void (*intra_pred_t)(pixel* dst, intptr_t dstStride, pixel* refLeft, pixel* refAbove, int dirMode, int bFilter);
typedef void (*intra_allangs_t)(pixel* dst, pixel* above0, pixel* left0, pixel* above1, pixel* left1, int bLuma);
-typedef void (*cpy16to16_shl_t)(int16_t* dst, const int16_t* src, intptr_t, int, int);
-typedef void (*cvt16to32_shl_t)(int32_t* dst, const int16_t* src, intptr_t, int, int);
-typedef void (*cvt16to32_shr_t)(int32_t* dst, const int16_t* src, intptr_t, int, int);
-typedef void (*cvt32to16_shl_t)(int16_t* dst, const int32_t* src, intptr_t, int);
-typedef uint32_t (*copy_cnt_t)(int16_t* coeff, const int16_t* residual, intptr_t stride);
-typedef void (*copy_shr_t)(int16_t* dst, const int16_t* src, intptr_t stride, int shift, int size);
-typedef void (*copy_shl_t)(int16_t* dst, const int16_t* src, intptr_t stride, int shift);
+typedef void (*cpy2Dto1D_shl_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+typedef void (*cpy2Dto1D_shr_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+typedef void (*cpy1Dto2D_shl_t)(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+typedef void (*cpy1Dto2D_shr_t)(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+typedef uint32_t (*copy_cnt_t)(int16_t* coeff, const int16_t* residual, intptr_t resiStride);
-typedef void (*dct_t)(const int16_t* src, int16_t* dst, intptr_t stride);
-typedef void (*idct_t)(const int16_t* src, int16_t* dst, intptr_t stride);
+typedef void (*dct_t)(const int16_t* src, int16_t* dst, intptr_t srcStride);
+typedef void (*idct_t)(const int16_t* src, int16_t* dst, intptr_t dstStride);
typedef void (*denoiseDct_t)(int16_t* dctCoef, uint32_t* resSum, const uint16_t* offset, int numCoeff);
typedef void (*calcresidual_t)(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
typedef void (*transpose_t)(pixel* dst, const pixel* src, intptr_t stride);
-typedef uint32_t (*quant_t)(const int16_t *coef, const int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
-typedef uint32_t (*nquant_t)(const int16_t *coef, const int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
-typedef void (*dequant_scaling_t)(const int16_t* src, const int32_t*vdequantCoef, int16_t* dst, int num, int mcqp_miper, int shift);
+typedef uint32_t (*quant_t)(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
+typedef uint32_t (*nquant_t)(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
+typedef void (*dequant_scaling_t)(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift);
typedef void (*dequant_normal_t)(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
typedef int (*count_nonzero_t)(const int16_t* quantCoeff, int numCoeff);
@@ -186,7 +181,7 @@
typedef void (*filter_hv_pp_t) (const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY);
typedef void (*filter_p2s_t)(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height);
-typedef void (*copy_pp_t)(pixel* dst, intptr_t dstride, const pixel* src, intptr_t sstride); // dst is aligned
+typedef void (*copy_pp_t)(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); // dst is aligned
typedef void (*copy_sp_t)(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
typedef void (*copy_ps_t)(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
typedef void (*copy_ss_t)(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
@@ -195,7 +190,7 @@
typedef void (*pixel_add_ps_t)(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
typedef void (*addAvg_t)(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
-typedef void (*saoCuOrgE0_t)(pixel* rec, int8_t* offsetEo, int width, int8_t signLeft);
+typedef void (*saoCuOrgE0_t)(pixel* rec, int8_t* offsetEo, int width, int8_t signLeft);
typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
typedef void (*planecopy_sp_t) (const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
@@ -220,12 +215,11 @@
pixelcmp_ss_t psy_cost_ss[NUM_SQUARE_BLOCKS];
blockfill_s_t blockfill_s[NUM_SQUARE_BLOCKS]; // block fill with value
- cpy16to16_shl_t cpy16to16_shl;
- cvt16to32_shr_t cvt16to32_shr[NUM_SQUARE_BLOCKS - 1];
- cvt32to16_shl_t cvt32to16_shl[NUM_SQUARE_BLOCKS - 1];
+ cpy2Dto1D_shl_t cpy2Dto1D_shl[NUM_SQUARE_BLOCKS - 1];
+ cpy2Dto1D_shr_t cpy2Dto1D_shr[NUM_SQUARE_BLOCKS - 1];
+ cpy1Dto2D_shl_t cpy1Dto2D_shl[NUM_SQUARE_BLOCKS - 1];
+ cpy1Dto2D_shr_t cpy1Dto2D_shr[NUM_SQUARE_BLOCKS - 1];
copy_cnt_t copy_cnt[NUM_SQUARE_BLOCKS - 1];
- copy_shr_t copy_shr;
- copy_shl_t copy_shl[NUM_SQUARE_BLOCKS - 1];
copy_pp_t luma_copy_pp[NUM_LUMA_PARTITIONS];
copy_sp_t luma_copy_sp[NUM_LUMA_PARTITIONS];
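All four new tables are indexed by square block size (NUM_SQUARE_BLOCKS - 1 entries, 4x4 through 32x32), so callers can drop the runtime size argument that cpy16to16_shl and copy_shr used to take. Usage as in the quant.cpp hunks below, with sizeIdx = log2TrSize - 2:

    const uint32_t sizeIdx = log2TrSize - 2; // 4x4 -> 0, 8x8 -> 1, 16x16 -> 2, 32x32 -> 3
    primitives.cpy2Dto1D_shl[sizeIdx](m_resiDctCoeff, residual, resiStride, transformShift);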
diff -r dfe0803ae6be -r b4454aa1b6ab source/common/quant.cpp
--- a/source/common/quant.cpp Wed Nov 26 16:56:00 2014 -0600
+++ b/source/common/quant.cpp Thu Nov 27 10:12:03 2014 +0900
@@ -322,49 +322,46 @@
return numSig;
}
-uint32_t Quant::transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t stride,
+uint32_t Quant::transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t resiStride,
coeff_t* coeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip)
{
+ const uint32_t sizeIdx = log2TrSize - 2;
if (cu.m_tqBypass[absPartIdx])
{
X265_CHECK(log2TrSize >= 2 && log2TrSize <= 5, "Block size mistake!\n");
- return primitives.copy_cnt[log2TrSize - 2](coeff, residual, stride);
+ return primitives.copy_cnt[sizeIdx](coeff, residual, resiStride);
}
bool isLuma = ttype == TEXT_LUMA;
bool usePsy = m_psyRdoqScale && isLuma && !useTransformSkip;
int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; // Represents scaling through forward transform
- int trSize = 1 << log2TrSize;
X265_CHECK((cu.m_slice->m_sps->quadtreeTULog2MaxSize >= log2TrSize), "transform size too large\n");
if (useTransformSkip)
{
#if X265_DEPTH <= 10
- primitives.cpy16to16_shl(m_resiDctCoeff, residual, stride, transformShift, trSize);
+ X265_CHECK(transformShift >= 0, "invalid transformShift\n");
+ primitives.cpy2Dto1D_shl[sizeIdx](m_resiDctCoeff, residual, resiStride, transformShift);
#else
if (transformShift >= 0)
- primitives.cvt16to32_shl(m_resiDctCoeff, residual, stride, transformShift, trSize);
+ primitives.cpy2Dto1D_shl[sizeIdx](m_resiDctCoeff, residual, resiStride, transformShift);
else
- {
- int shift = -transformShift;
- int offset = (1 << (shift - 1));
- primitives.cvt16to32_shr[log2TrSize - 2](m_resiDctCoeff, residual, stride, shift, offset);
- }
+ primitives.cpy2Dto1D_shr[sizeIdx](m_resiDctCoeff, residual, resiStride, -transformShift);
#endif
}
else
{
bool isIntra = cu.isIntra(absPartIdx);
- const uint32_t sizeIdx = log2TrSize - 2;
int useDST = !sizeIdx && isLuma && isIntra;
int index = DCT_4x4 + sizeIdx - useDST;
- primitives.dct[index](residual, m_resiDctCoeff, stride);
+ primitives.dct[index](residual, m_resiDctCoeff, resiStride);
/* NOTE: if RDOQ is disabled globally, psy-rdoq is also disabled, so
* there is no risk of performing this DCT unnecessarily */
if (usePsy)
{
+ int trSize = 1 << log2TrSize;
/* perform DCT on source pixels for psy-rdoq */
primitives.square_copy_ps[sizeIdx](m_fencShortBuf, trSize, fenc, fencStride);
primitives.dct[index](m_fencShortBuf, m_fencDctCoeff, trSize);
@@ -408,12 +405,13 @@
}
}
-void Quant::invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t stride, const coeff_t* coeff,
+void Quant::invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t resiStride, const coeff_t* coeff,
uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig)
{
+ const uint32_t sizeIdx = log2TrSize - 2;
if (transQuantBypass)
{
- primitives.copy_shl[log2TrSize - 2](residual, coeff, stride, 0);
+ primitives.cpy1Dto2D_shl[sizeIdx](residual, coeff, resiStride, 0);
return;
}
@@ -427,7 +425,7 @@
if (m_scalingList->m_bEnabled)
{
int scalingListType = (bIntra ? 0 : 3) + ttype;
- const int32_t* dequantCoef = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem];
+ const int32_t* dequantCoef = m_scalingList->m_dequantCoef[sizeIdx][scalingListType][rem];
primitives.dequant_scaling(coeff, dequantCoef, m_resiDctCoeff, numCoeff, per, shift);
}
else
@@ -438,20 +436,18 @@
if (useTransformSkip)
{
- int trSize = 1 << log2TrSize;
-
#if X265_DEPTH <= 10
- primitives.copy_shr(residual, m_resiDctCoeff, stride, transformShift, trSize);
+ X265_CHECK(transformShift > 0, "invalid transformShift\n");
+ primitives.cpy1Dto2D_shr[sizeIdx](residual, m_resiDctCoeff, resiStride, transformShift);
#else
if (transformShift > 0)
- primitives.copy_shr(residual, m_resiDctCoeff, stride, transformShift, trSize);
+ primitives.cpy1Dto2D_shr[sizeIdx](residual, m_resiDctCoeff, resiStride, transformShift);
else
- primitives.cvt32to16_shl[log2TrSize - 2](residual, m_resiDctCoeff, stride, -transformShift);
+ primitives.cpy1Dto2D_shl[sizeIdx](residual, m_resiDctCoeff, resiStride, -transformShift);
#endif
}
else
{
- const uint32_t sizeIdx = log2TrSize - 2;
int useDST = !sizeIdx && ttype == TEXT_LUMA && bIntra;
X265_CHECK((int)numSig == primitives.count_nonzero(coeff, 1 << (log2TrSize * 2)), "numSig differ\n");
@@ -459,17 +455,17 @@
// DC only
if (numSig == 1 && coeff[0] != 0 && !useDST)
{
- const int shift_1st = 7;
+ const int shift_1st = 7 - 6;
const int add_1st = 1 << (shift_1st - 1);
- const int shift_2nd = 12 - (X265_DEPTH - 8);
+ const int shift_2nd = 12 - (X265_DEPTH - 8) - 3;
const int add_2nd = 1 << (shift_2nd - 1);
- int dc_val = (((m_resiDctCoeff[0] * 64 + add_1st) >> shift_1st) * 64 + add_2nd) >> shift_2nd;
- primitives.blockfill_s[sizeIdx](residual, stride, (int16_t)dc_val);
+ int dc_val = (((m_resiDctCoeff[0] * (64 >> 6) + add_1st) >> shift_1st) * (64 >> 3) + add_2nd) >> shift_2nd;
+ primitives.blockfill_s[sizeIdx](residual, resiStride, (int16_t)dc_val);
return;
}
- primitives.idct[IDCT_4x4 + sizeIdx - useDST](m_resiDctCoeff, residual, stride);
+ primitives.idct[IDCT_4x4 + sizeIdx - useDST](m_resiDctCoeff, residual, resiStride);
}
}
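Two notes on the hunks above. First, the new X265_CHECKs encode why the preprocessor split is safe: transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize, and for X265_DEPTH <= 10 its smallest value is 15 - 10 - 5 = 0, so the forward transform-skip path can call cpy2Dto1D_shl unconditionally. Second, the rewritten DC-only constants are bit-exact with the old ones: with s = 12 - (X265_DEPTH - 8), the first-stage numerator and shift both shrink by 2^6 and the second-stage pair by 2^3, leaving every intermediate floor division unchanged. A hedged brute-force check over all int16_t coefficient values, assuming 8-bit depth (s = 12); this is a standalone sanity program, not part of the patch:

    #include <cassert>

    int main()
    {
        const int s = 12; // 12 - (X265_DEPTH - 8) at X265_DEPTH == 8
        for (int c = -32768; c <= 32767; c++)
        {
            // old path: shift_1st = 7, shift_2nd = s
            int oldDc = (((c * 64 + 64) >> 7) * 64 + (1 << (s - 1))) >> s;
            // new path: shift_1st = 1, shift_2nd = s - 3
            int newDc = (((c + 1) >> 1) * 8 + (1 << (s - 4))) >> (s - 3);
            assert(oldDc == newDc);
        }
        return 0;
    }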
diff -r dfe0803ae6be -r b4454aa1b6ab source/common/quant.h
--- a/source/common/quant.h Wed Nov 26 16:56:00 2014 -0600
+++ b/source/common/quant.h Thu Nov 27 10:12:03 2014 +0900
@@ -104,10 +104,10 @@
/* CU setup */
void setQPforQuant(const CUData& ctu);
- uint32_t transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencstride, const int16_t* residual, uint32_t stride, coeff_t* coeff,
+ uint32_t transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t resiStride, coeff_t* coeff,
uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip);
- void invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t stride, const coeff_t* coeff,
+ void invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t resiStride, const coeff_t* coeff,
uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig);
/* static methods shared with entropy.cpp */
diff -r dfe0803ae6be -r b4454aa1b6ab source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Nov 26 16:56:00 2014 -0600
+++ b/source/common/x86/asm-primitives.cpp Thu Nov 27 10:12:03 2014 +0900
@@ -1336,10 +1336,22 @@
p.sad_x4[LUMA_16x4] = x265_pixel_sad_x4_16x4_sse2;
p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_mmx2;
- p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_sse2;
- p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_sse2;
- p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_sse2;
- p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_sse2;
+ p.cpy2Dto1D_shl[BLOCK_4x4] = x265_cpy2Dto1D_shl_4_sse2;
+ p.cpy2Dto1D_shl[BLOCK_8x8] = x265_cpy2Dto1D_shl_8_sse2;
+ p.cpy2Dto1D_shl[BLOCK_16x16] = x265_cpy2Dto1D_shl_16_sse2;
+ p.cpy2Dto1D_shl[BLOCK_32x32] = x265_cpy2Dto1D_shl_32_sse2;
+ p.cpy2Dto1D_shr[BLOCK_4x4] = x265_cpy2Dto1D_shr_4_sse2;
+ p.cpy2Dto1D_shr[BLOCK_8x8] = x265_cpy2Dto1D_shr_8_sse2;
+ p.cpy2Dto1D_shr[BLOCK_16x16] = x265_cpy2Dto1D_shr_16_sse2;
+ p.cpy2Dto1D_shr[BLOCK_32x32] = x265_cpy2Dto1D_shr_32_sse2;
+ p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_sse2;
+ p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_sse2;
+ p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_sse2;
+ p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_sse2;
+ p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_sse2;
+ p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_sse2;
+ p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_sse2;
+ p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_sse2;
CHROMA_PIXELSUB_PS(_sse2);
CHROMA_PIXELSUB_PS_422(_sse2);
@@ -1406,10 +1418,6 @@
p.quant = x265_quant_sse4;
p.nquant = x265_nquant_sse4;
p.dequant_normal = x265_dequant_normal_sse4;
- p.cvt16to32_shr[BLOCK_4x4] = x265_cvt16to32_shr_4_sse4;
- p.cvt16to32_shr[BLOCK_8x8] = x265_cvt16to32_shr_8_sse4;
- p.cvt16to32_shr[BLOCK_16x16] = x265_cvt16to32_shr_16_sse4;
- p.cvt16to32_shr[BLOCK_32x32] = x265_cvt16to32_shr_32_sse4;
p.intra_pred[0][BLOCK_4x4] = x265_intra_pred_planar4_sse4;
p.intra_pred[0][BLOCK_8x8] = x265_intra_pred_planar8_sse4;
p.intra_pred[0][BLOCK_16x16] = x265_intra_pred_planar16_sse4;
@@ -1438,6 +1446,14 @@
p.nquant = x265_nquant_avx2;
p.dequant_normal = x265_dequant_normal_avx2;
p.scale1D_128to64 = x265_scale1D_128to64_avx2;
+ p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_avx2;
+ p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_avx2;
+ p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_avx2;
+ p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_avx2;
+ p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_avx2;
+ p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_avx2;
+ p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_avx2;
+ p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_avx2;
#if X86_64
p.dct[DCT_8x8] = x265_dct8_avx2;
p.dct[DCT_16x16] = x265_dct16_avx2;
@@ -1548,11 +1564,23 @@
p.frame_init_lowres_core = x265_frame_init_lowres_core_sse2;
SA8D_INTER_FROM_BLOCK(sse2);
- p.cpy16to16_shl = x265_copy16to16_shl_sse2;
- p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_sse2;
- p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_sse2;
- p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_sse2;
- p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_sse2;
+ p.cpy2Dto1D_shl[BLOCK_4x4] = x265_cpy2Dto1D_shl_4_sse2;
+ p.cpy2Dto1D_shl[BLOCK_8x8] = x265_cpy2Dto1D_shl_8_sse2;
+ p.cpy2Dto1D_shl[BLOCK_16x16] = x265_cpy2Dto1D_shl_16_sse2;
+ p.cpy2Dto1D_shl[BLOCK_32x32] = x265_cpy2Dto1D_shl_32_sse2;
+ p.cpy2Dto1D_shr[BLOCK_4x4] = x265_cpy2Dto1D_shr_4_sse2;
+ p.cpy2Dto1D_shr[BLOCK_8x8] = x265_cpy2Dto1D_shr_8_sse2;
+ p.cpy2Dto1D_shr[BLOCK_16x16] = x265_cpy2Dto1D_shr_16_sse2;
+ p.cpy2Dto1D_shr[BLOCK_32x32] = x265_cpy2Dto1D_shr_32_sse2;
+ p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_sse2;
+ p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_sse2;
+ p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_sse2;
+ p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_sse2;
+ p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_sse2;
+ p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_sse2;
+ p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_sse2;
+ p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_sse2;
+
p.calcresidual[BLOCK_4x4] = x265_getResidual4_sse2;
p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2;
p.transpose[BLOCK_4x4] = x265_transpose4_sse2;
@@ -1568,10 +1596,6 @@
p.idct[IDST_4x4] = x265_idst4_sse2;
p.planecopy_sp = x265_downShift_16_sse2;
- p.copy_shl[BLOCK_4x4] = x265_copy_shl_4_sse2;
- p.copy_shl[BLOCK_8x8] = x265_copy_shl_8_sse2;
- p.copy_shl[BLOCK_16x16] = x265_copy_shl_16_sse2;
- p.copy_shl[BLOCK_32x32] = x265_copy_shl_32_sse2;
}
if (cpuMask & X265_CPU_SSSE3)
{
@@ -1615,10 +1639,6 @@
LUMA_ADDAVG(_sse4);
CHROMA_ADDAVG(_sse4);
CHROMA_ADDAVG_422(_sse4);
- p.cvt16to32_shr[BLOCK_4x4] = x265_cvt16to32_shr_4_sse4;
- p.cvt16to32_shr[BLOCK_8x8] = x265_cvt16to32_shr_8_sse4;
- p.cvt16to32_shr[BLOCK_16x16] = x265_cvt16to32_shr_16_sse4;
- p.cvt16to32_shr[BLOCK_32x32] = x265_cvt16to32_shr_32_sse4;
// TODO: check POPCNT flag!
p.copy_cnt[BLOCK_4x4] = x265_copy_cnt_4_sse4;
@@ -1688,7 +1708,6 @@
INTRA_ANG_SSE4(sse4);
p.dct[DCT_8x8] = x265_dct8_sse4;
- p.copy_shr = x265_copy_shr_sse4;
// p.denoiseDct = x265_denoise_dct_sse4;
}
if (cpuMask & X265_CPU_AVX)
@@ -1759,10 +1778,14 @@
p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_avx2;
p.blockfill_s[BLOCK_32x32] = x265_blockfill_s_32x32_avx2;
- p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_avx2;
- p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_avx2;
- p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_avx2;
- p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_avx2;
+ p.cpy1Dto2D_shl[BLOCK_4x4] = x265_cpy1Dto2D_shl_4_avx2;
+ p.cpy1Dto2D_shl[BLOCK_8x8] = x265_cpy1Dto2D_shl_8_avx2;
+ p.cpy1Dto2D_shl[BLOCK_16x16] = x265_cpy1Dto2D_shl_16_avx2;
+ p.cpy1Dto2D_shl[BLOCK_32x32] = x265_cpy1Dto2D_shl_32_avx2;
+ p.cpy1Dto2D_shr[BLOCK_4x4] = x265_cpy1Dto2D_shr_4_avx2;
+ p.cpy1Dto2D_shr[BLOCK_8x8] = x265_cpy1Dto2D_shr_8_avx2;
+ p.cpy1Dto2D_shr[BLOCK_16x16] = x265_cpy1Dto2D_shr_16_avx2;
+ p.cpy1Dto2D_shr[BLOCK_32x32] = x265_cpy1Dto2D_shr_32_avx2;
// p.denoiseDct = x265_denoise_dct_avx2;
p.dct[DCT_4x4] = x265_dct4_avx2;
diff -r dfe0803ae6be -r b4454aa1b6ab source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Wed Nov 26 16:56:00 2014 -0600
+++ b/source/common/x86/blockcopy8.asm Thu Nov 27 10:12:03 2014 +0900
@@ -41,7 +41,7 @@
SECTION .text
;-----------------------------------------------------------------------------
-; void blockcopy_pp_2x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_2x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_2x4, 4, 7, 0
@@ -59,7 +59,7 @@
RET
;-----------------------------------------------------------------------------
-; void blockcopy_pp_2x8(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_2x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_2x8, 4, 7, 0
@@ -97,7 +97,7 @@
RET
;-----------------------------------------------------------------------------
-; void blockcopy_pp_2x16(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_2x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_2x16, 4, 7, 0
@@ -115,7 +115,7 @@
;-----------------------------------------------------------------------------
-; void blockcopy_pp_4x2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_4x2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_4x2, 4, 6, 0
@@ -127,7 +127,7 @@
RET
;-----------------------------------------------------------------------------
-; void blockcopy_pp_4x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_4x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_4x4, 4, 4, 4
@@ -145,7 +145,7 @@
RET
;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W4_H8 2
INIT_XMM sse2
@@ -192,7 +192,7 @@
BLOCKCOPY_PP_W4_H8 4, 32
;-----------------------------------------------------------------------------
-; void blockcopy_pp_6x8(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_6x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_6x8, 4, 7, 8
@@ -257,7 +257,7 @@
RET
;-----------------------------------------------------------------------------
-; void blockcopy_pp_6x16(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_6x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_6x16, 4, 7, 2
@@ -279,7 +279,7 @@
;-----------------------------------------------------------------------------
-; void blockcopy_pp_8x2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_8x2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_8x2, 4, 4, 2
@@ -291,7 +291,7 @@
RET
;-----------------------------------------------------------------------------
-; void blockcopy_pp_8x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_8x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_8x4, 4, 4, 4
@@ -309,7 +309,7 @@
RET
;-----------------------------------------------------------------------------
-; void blockcopy_pp_8x6(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_8x6(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_8x6, 4, 7, 6
@@ -333,7 +333,7 @@
RET
;-----------------------------------------------------------------------------
-; void blockcopy_pp_8x12(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_8x12(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_8x12, 4, 5, 2
@@ -350,7 +350,7 @@
RET
;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W8_H8 2
INIT_XMM sse2
@@ -397,7 +397,7 @@
BLOCKCOPY_PP_W8_H8 8, 64
;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W12_H4 2
INIT_XMM sse2
@@ -439,7 +439,7 @@
BLOCKCOPY_PP_W12_H4 12, 32
;-----------------------------------------------------------------------------
-; void blockcopy_pp_16x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_16x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W16_H4 2
INIT_XMM sse2
@@ -471,7 +471,7 @@
BLOCKCOPY_PP_W16_H4 16, 12
;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W16_H8 2
INIT_XMM sse2
@@ -519,7 +519,7 @@
BLOCKCOPY_PP_W16_H8 16, 24
;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W24_H4 2
INIT_XMM sse2
@@ -560,7 +560,7 @@
BLOCKCOPY_PP_W24_H4 24, 64
;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W32_H4 2
INIT_XMM sse2
@@ -684,7 +684,7 @@
RET
;-----------------------------------------------------------------------------
-; void blockcopy_pp_32x24(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_32x24(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_YMM avx
cglobal blockcopy_pp_32x24, 4, 7, 6
@@ -722,7 +722,7 @@
RET
;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W32_H16_avx 2
INIT_YMM avx
@@ -788,7 +788,7 @@
BLOCKCOPY_PP_W32_H16_avx 32, 64
;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W48_H2 2
INIT_XMM sse2
@@ -836,7 +836,7 @@
BLOCKCOPY_PP_W48_H2 48, 64
;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W64_H4 2
INIT_XMM sse2
@@ -897,7 +897,7 @@
BLOCKCOPY_PP_W64_H4 64, 64
;-----------------------------------------------------------------------------
-; void blockcopy_sp_2x4(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_2x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_sp_2x4, 4, 5, 2
@@ -926,7 +926,7 @@
;-----------------------------------------------------------------------------
-; void blockcopy_sp_2x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_2x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_sp_2x8, 4, 5, 2
@@ -974,11 +974,11 @@
RET
;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W2_H2 2
INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 7, 2, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 7, 2, dst, dstStride, src, srcStride
add r3, r3
mov r6d, %2/2
.loop:
@@ -1003,10 +1003,10 @@
BLOCKCOPY_SP_W2_H2 2, 16
;-----------------------------------------------------------------------------
-; void blockcopy_sp_4x2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_4x2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal blockcopy_sp_4x2, 4, 4, 2, dest, destStride, src, srcStride
+cglobal blockcopy_sp_4x2, 4, 4, 2, dst, dstStride, src, srcStride
add r3, r3
@@ -1022,10 +1022,10 @@
RET
;-----------------------------------------------------------------------------
-; void blockcopy_sp_4x4(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_4x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal blockcopy_sp_4x4, 4, 4, 4, dest, destStride, src, srcStride
+cglobal blockcopy_sp_4x4, 4, 4, 4, dst, dstStride, src, srcStride
add r3, r3
@@ -1049,10 +1049,10 @@
RET
;-----------------------------------------------------------------------------
-; void blockcopy_sp_4x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_4x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal blockcopy_sp_4x8, 4, 4, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_4x8, 4, 4, 8, dst, dstStride, src, srcStride
add r3, r3
@@ -1092,11 +1092,11 @@
RET
;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W4_H8 2
INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
mov r4d, %2/8
@@ -1150,7 +1150,7 @@
BLOCKCOPY_SP_W4_H8 4, 32
;-----------------------------------------------------------------------------
-; void blockcopy_sp_6x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_6x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_sp_6x8, 4, 4, 2
@@ -1213,11 +1213,11 @@
RET
;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W6_H2 2
INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 7, 4, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride
add r3, r3
mov r6d, %2/2
.loop:
@@ -1247,10 +1247,10 @@
BLOCKCOPY_SP_W6_H2 6, 16
;-----------------------------------------------------------------------------
-; void blockcopy_sp_8x2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_8x2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal blockcopy_sp_8x2, 4, 4, 2, dest, destStride, src, srcStride
+cglobal blockcopy_sp_8x2, 4, 4, 2, dst, dstStride, src, srcStride
add r3, r3
@@ -1265,10 +1265,10 @@
RET
;-----------------------------------------------------------------------------
-; void blockcopy_sp_8x4(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_8x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal blockcopy_sp_8x4, 4, 4, 4, dest, destStride, src, srcStride
+cglobal blockcopy_sp_8x4, 4, 4, 4, dst, dstStride, src, srcStride
add r3, r3
@@ -1290,10 +1290,10 @@
RET
;-----------------------------------------------------------------------------
-; void blockcopy_sp_8x6(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_8x6(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal blockcopy_sp_8x6, 4, 4, 6, dest, destStride, src, srcStride
+cglobal blockcopy_sp_8x6, 4, 4, 6, dst, dstStride, src, srcStride
add r3, r3
@@ -1322,10 +1322,10 @@
RET
;-----------------------------------------------------------------------------
-; void blockcopy_sp_8x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_8x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal blockcopy_sp_8x8, 4, 4, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_8x8, 4, 4, 8, dst, dstStride, src, srcStride
add r3, r3
@@ -1361,11 +1361,11 @@
RET
;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W8_H4 2
INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 4, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 4, dst, dstStride, src, srcStride
add r3, r3
mov r4d, %2/4
.loop:
@@ -1391,11 +1391,11 @@
BLOCKCOPY_SP_W8_H4 8, 12
;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W8_H8 2
INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
mov r4d, %2/8
@@ -1446,11 +1446,11 @@
BLOCKCOPY_SP_W8_H8 8, 64
;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W12_H4 2
INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
mov r4d, %2/4
@@ -1503,11 +1503,11 @@
BLOCKCOPY_SP_W12_H4 12, 32
;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W16_H4 2
INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
mov r4d, %2/4
@@ -1554,11 +1554,11 @@
BLOCKCOPY_SP_W16_H4 16, 24
;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W24_H2 2
INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 6, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride
mov r4d, %2/2
@@ -1595,11 +1595,11 @@
BLOCKCOPY_SP_W24_H2 24, 64
;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W32_H2 2
INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
mov r4d, %2/2
@@ -1643,11 +1643,11 @@
BLOCKCOPY_SP_W32_H2 32, 48
;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W48_H2 2
INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 6, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride
mov r4d, %2
@@ -1681,11 +1681,11 @@
BLOCKCOPY_SP_W48_H2 48, 64
;-----------------------------------------------------------------------------
-; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W64_H1 2
INIT_XMM sse2
-cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride
+cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
mov r4d, %2
@@ -1726,10 +1726,10 @@
BLOCKCOPY_SP_W64_H1 64, 64
;-----------------------------------------------------------------------------
-; void blockfill_s_4x4(int16_t *dest, intptr_t destride, int16_t val)
+; void blockfill_s_4x4(int16_t* dst, intptr_t dstride, int16_t val)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal blockfill_s_4x4, 3, 3, 1, dest, destStride, val
+cglobal blockfill_s_4x4, 3, 3, 1, dst, dstStride, val
add r1, r1
@@ -1745,10 +1745,10 @@
RET
;-----------------------------------------------------------------------------
-; void blockfill_s_8x8(int16_t *dest, intptr_t destride, int16_t val)
+; void blockfill_s_8x8(int16_t* dst, intptr_t dstride, int16_t val)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal blockfill_s_8x8, 3, 3, 1, dest, destStride, val
+cglobal blockfill_s_8x8, 3, 3, 1, dst, dstStride, val
add r1, r1
@@ -1774,11 +1774,11 @@
RET
;-----------------------------------------------------------------------------
-; void blockfill_s_%1x%2(int16_t *dest, intptr_t destride, int16_t val)
+; void blockfill_s_%1x%2(int16_t* dst, intptr_t dstStride, int16_t val)
;-----------------------------------------------------------------------------
%macro BLOCKFILL_S_W16_H8 2
INIT_XMM sse2
-cglobal blockfill_s_%1x%2, 3, 5, 1, dest, destStride, val
+cglobal blockfill_s_%1x%2, 3, 5, 1, dst, dstStride, val
mov r3d, %2/8
@@ -1855,11 +1855,11 @@
RET
;-----------------------------------------------------------------------------
-; void blockfill_s_%1x%2(int16_t *dest, intptr_t destride, int16_t val)
+; void blockfill_s_%1x%2(int16_t* dst, intptr_t dstStride, int16_t val)
;-----------------------------------------------------------------------------
%macro BLOCKFILL_S_W32_H4 2
INIT_XMM sse2
-cglobal blockfill_s_%1x%2, 3, 5, 1, dest, destStride, val
+cglobal blockfill_s_%1x%2, 3, 5, 1, dst, dstStride, val
mov r3d, %2/4
@@ -1983,10 +1983,10 @@
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ps_2x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_2x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal blockcopy_ps_2x4, 4, 4, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_2x4, 4, 4, 1, dst, dstStride, src, srcStride
add r1, r1
@@ -2013,10 +2013,10 @@
;-----------------------------------------------------------------------------
-; void blockcopy_ps_2x8(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_2x8(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal blockcopy_ps_2x8, 4, 4, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_2x8, 4, 4, 1, dst, dstStride, src, srcStride
add r1, r1
@@ -2065,10 +2065,10 @@
;-----------------------------------------------------------------------------
-; void blockcopy_ps_2x16(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_2x16(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal blockcopy_ps_2x16, 4, 5, 2, dest, destStride, src, srcStride
+cglobal blockcopy_ps_2x16, 4, 5, 2, dst, dstStride, src, srcStride
add r1, r1
mov r4d, 16/2
.loop:
@@ -2086,10 +2086,10 @@
;-----------------------------------------------------------------------------
-; void blockcopy_ps_4x2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_4x2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal blockcopy_ps_4x2, 4, 4, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_4x2, 4, 4, 1, dst, dstStride, src, srcStride
add r1, r1
@@ -2105,10 +2105,10 @@
;-----------------------------------------------------------------------------
-; void blockcopy_ps_4x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_4x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal blockcopy_ps_4x4, 4, 4, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_4x4, 4, 4, 1, dst, dstStride, src, srcStride
add r1, r1
@@ -2135,11 +2135,11 @@
;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W4_H4 2
INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride
add r1, r1
mov r4d, %2/4
@@ -2180,11 +2180,11 @@
;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W6_H4 2
INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride
add r1, r1
mov r4d, %2/4
@@ -2227,10 +2227,10 @@
BLOCKCOPY_PS_W6_H4 6, 16
;-----------------------------------------------------------------------------
-; void blockcopy_ps_8x2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_8x2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal blockcopy_ps_8x2, 4, 4, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_8x2, 4, 4, 1, dst, dstStride, src, srcStride
add r1, r1
@@ -2245,10 +2245,10 @@
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ps_8x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_8x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal blockcopy_ps_8x4, 4, 4, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_8x4, 4, 4, 1, dst, dstStride, src, srcStride
add r1, r1
@@ -2274,10 +2274,10 @@
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ps_8x6(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_8x6(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal blockcopy_ps_8x6, 4, 4, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_8x6, 4, 4, 1, dst, dstStride, src, srcStride
add r1, r1
@@ -2314,11 +2314,11 @@
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W8_H4 2
INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 1, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride
add r1, r1
mov r4d, %2/4
@@ -2361,11 +2361,11 @@
;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W12_H2 2
INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
add r1, r1
mov r4d, %2/2
@@ -2398,10 +2398,10 @@
BLOCKCOPY_PS_W12_H2 12, 32
;-----------------------------------------------------------------------------
-; void blockcopy_ps_16x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_16x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal blockcopy_ps_16x4, 4, 4, 3, dest, destStride, src, srcStride
+cglobal blockcopy_ps_16x4, 4, 4, 3, dst, dstStride, src, srcStride
add r1, r1
pxor m0, m0
@@ -2436,11 +2436,11 @@
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W16_H4 2
INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
add r1, r1
mov r4d, %2/4
@@ -2492,11 +2492,11 @@
BLOCKCOPY_PS_W16_H4 16, 24
;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W24_H2 2
INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
add r1, r1
mov r4d, %2/2
@@ -2537,11 +2537,11 @@
BLOCKCOPY_PS_W24_H2 24, 64
;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W32_H2 2
INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
add r1, r1
mov r4d, %2/2
@@ -2590,11 +2590,11 @@
BLOCKCOPY_PS_W32_H2 32, 48
;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W48_H2 2
INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
add r1, r1
mov r4d, %2/2
@@ -2649,11 +2649,11 @@
BLOCKCOPY_PS_W48_H2 48, 64
;-----------------------------------------------------------------------------
-; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W64_H2 2
INIT_XMM sse4
-cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
+cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
add r1, r1
mov r4d, %2/2
@@ -2723,7 +2723,7 @@
BLOCKCOPY_PS_W64_H2 64, 64
;-----------------------------------------------------------------------------
-; void blockcopy_ss_2x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_2x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_2x4, 4, 6, 0
@@ -2746,7 +2746,7 @@
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ss_2x8(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_2x8(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_2x8, 4, 6, 0
@@ -2785,7 +2785,7 @@
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ss_2x16(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_2x16(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_2x16, 4, 7, 0
@@ -2805,7 +2805,7 @@
;-----------------------------------------------------------------------------
-; void blockcopy_ss_4x2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_4x2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_4x2, 4, 4, 2
@@ -2821,7 +2821,7 @@
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ss_4x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_4x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_4x4, 4, 4, 4
@@ -2841,7 +2841,7 @@
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W4_H8 2
INIT_XMM sse2
@@ -2889,7 +2889,7 @@
BLOCKCOPY_SS_W4_H8 4, 32
;-----------------------------------------------------------------------------
-; void blockcopy_ss_6x8(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_6x8(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_6x8, 4, 4, 4
@@ -2944,7 +2944,7 @@
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ss_6x16(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_6x16(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_6x16, 4, 5, 4
@@ -2968,7 +2968,7 @@
;-----------------------------------------------------------------------------
-; void blockcopy_ss_8x2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_8x2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_8x2, 4, 4, 2
@@ -2984,7 +2984,7 @@
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ss_8x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_8x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_8x4, 4, 4, 4
@@ -3005,7 +3005,7 @@
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ss_8x6(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_8x6(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_8x6, 4, 4, 4
@@ -3034,7 +3034,7 @@
RET
;-----------------------------------------------------------------------------
-; void blockcopy_ss_8x12(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_8x12(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_8x12, 4, 5, 2
@@ -3054,7 +3054,7 @@
;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W8_H8 2
INIT_XMM sse2
@@ -3105,7 +3105,7 @@
BLOCKCOPY_SS_W8_H8 8, 64
;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W12_H4 2
INIT_XMM sse2
@@ -3149,7 +3149,7 @@
BLOCKCOPY_SS_W12_H4 12, 32
;-----------------------------------------------------------------------------
-; void blockcopy_ss_16x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_16x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W16_H4 2
INIT_XMM sse2
@@ -3192,7 +3192,7 @@
BLOCKCOPY_SS_W16_H4 16, 12
;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W16_H4_avx 2
INIT_YMM avx
@@ -3229,7 +3229,7 @@
BLOCKCOPY_SS_W16_H4_avx 16, 64
;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W16_H8 2
INIT_XMM sse2
@@ -3302,7 +3302,7 @@
BLOCKCOPY_SS_W16_H8 16, 24
;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W24_H4 2
INIT_XMM sse2
@@ -3354,7 +3354,7 @@
BLOCKCOPY_SS_W24_H4 24, 64
;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W32_H4 2
INIT_XMM sse2
@@ -3422,7 +3422,7 @@
BLOCKCOPY_SS_W32_H4 32, 48
;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W48_H2 2
INIT_XMM sse2
@@ -3500,11 +3500,11 @@
BLOCKCOPY_SS_W48_H2 48, 64
;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W64_H4 2
INIT_XMM sse2
-cglobal blockcopy_ss_%1x%2, 4, 5, 6, dest, deststride, src, srcstride
+cglobal blockcopy_ss_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride
mov r4d, %2/4
add r1, r1
add r3, r3
@@ -3606,11 +3606,11 @@
BLOCKCOPY_SS_W64_H4 64, 64
;-----------------------------------------------------------------------------
-; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W64_H4_avx 2
INIT_YMM avx
-cglobal blockcopy_ss_%1x%2, 4, 7, 4, dest, deststride, src, srcstride
+cglobal blockcopy_ss_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride
mov r4d, %2/4
add r1, r1
add r3, r3
@@ -3670,152 +3670,82 @@
BLOCKCOPY_SS_W64_H4_avx 64, 64
;--------------------------------------------------------------------------------------
-; void copy16to16_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size);
+; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
INIT_XMM sse2
-cglobal copy16to16_shl, 5, 6, 2, dst, src, stride, shift, size
-%define shift m1
-
- ; make shift
- movd shift, r3d
+cglobal cpy2Dto1D_shr_4, 3, 4, 4
+ add r2d, r2d
+ movd m0, r3m
+ pcmpeqw m1, m1
+ psllw m1, m0
+ psraw m1, 1
; register alloc
; r0 - dst
; r1 - src
- ; r2 - stride
- ; r4 - size
-
- sub r2d, r4d
- add r2d, r2d
- mov r5d, r4d
- shr r4d, 2
-.loop_row:
- mov r3d, r4d
-
-.loop_col:
- movh m0, [r1]
- psllw m0, shift
- movh [r0], m0
-
- add r1, 8
- add r0, 8
-
- dec r3d
- jnz .loop_col
-
- add r1, r2
- dec r5d
- jnz .loop_row
+ ; r2 - srcStride
+ ; m0 - shift
+ ; m1 - word [-round]
+
+ ; Row 0-3
+ movh m2, [r1]
+ movhps m2, [r1 + r2]
+ lea r1, [r1 + r2 * 2]
+ movh m3, [r1]
+ movhps m3, [r1 + r2]
+ psubw m2, m1
+ psubw m3, m1
+ psraw m2, m0
+ psraw m3, m0
+ mova [r0 + 0 * mmsize], m2
+ mova [r0 + 1 * mmsize], m3
RET
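The constant setup above is worth a note: pcmpeqw m1, m1 sets m1 to all ones (each word = -1), psllw m1, m0 turns that into -(1 << shift), and the arithmetic psraw m1, 1 halves it to -(1 << (shift - 1)), i.e. word [-round]; psubw then adds the rounding term before the arithmetic right shift, all without loading a constant from memory. A minimal C sketch of what the cpy2Dto1D_shr_N kernels compute (cpy2Dto1D_shr_ref and the size parameter are illustrative; shift >= 1 is assumed, since the trick needs a nonzero round):

    static void cpy2Dto1D_shr_ref(int16_t* dst, const int16_t* src,
                                  intptr_t srcStride, int shift, int size)
    {
        const int16_t round = (int16_t)(1 << (shift - 1));
        for (int y = 0; y < size; y++)
        {
            for (int x = 0; x < size; x++)
                dst[x] = (int16_t)((src[x] + round) >> shift);
            dst += size;        /* 1D destination: rows are packed */
            src += srcStride;   /* 2D source */
        }
    }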
;--------------------------------------------------------------------------------------
-; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
+; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal cvt16to32_shr_4, 3,3,3
+INIT_XMM sse2
+cglobal cpy2Dto1D_shr_8, 3, 5, 4
add r2d, r2d
movd m0, r3m
- movd m1, r4m
- pshufd m1, m1, 0
+ pcmpeqw m1, m1
+ psllw m1, m0
+ psraw m1, 1
+ mov r3d, 8/4
+ lea r4, [r2 * 3]
; register alloc
; r0 - dst
; r1 - src
- ; r2 - stride
- ; m0 - shift
- ; m1 - dword [offset]
-
- ; Row 0
- pmovsxwd m2, [r1]
- paddd m2, m1
- psrad m2, m0
- movu [r0 + 0 * mmsize], m2
-
- ; Row 1
- pmovsxwd m2, [r1 + r2]
- paddd m2, m1
- psrad m2, m0
- movu [r0 + 1 * mmsize], m2
-
- ; Row 2
- lea r1, [r1 + r2 * 2]
- pmovsxwd m2, [r1]
- paddd m2, m1
- psrad m2, m0
- movu [r0 + 2 * mmsize], m2
-
- ; Row 3
- pmovsxwd m2, [r1 + r2]
- paddd m2, m1
- psrad m2, m0
- movu [r0 + 3 * mmsize], m2
- RET
-
-
-;--------------------------------------------------------------------------------------
-; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
-;--------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal cvt16to32_shr_8, 3,5,3
- add r2d, r2d
- movd m0, r3m
- movd m1, r4m
- pshufd m1, m1, 0
- mov r3d, 8/4
- lea r4, [r2 * 3]
-
- ; register alloc
- ; r0 - dst
- ; r1 - src
- ; r2 - stride
+ ; r2 - srcStride
; r3 - loop counter
; r4 - stride * 3
; m0 - shift
- ; m1 - dword [offset]
+ ; m1 - word [-round]
.loop:
- ; Row 0
- pmovsxwd m2, [r1]
- pmovsxwd m3, [r1 + mmsize/2]
- paddd m2, m1
- paddd m3, m1
- psrad m2, m0
- psrad m3, m0
- movu [r0 + 0 * mmsize], m2
- movu [r0 + 1 * mmsize], m3
-
- ; Row 1
- pmovsxwd m2, [r1 + r2]
- pmovsxwd m3, [r1 + r2 + mmsize/2]
- paddd m2, m1
- paddd m3, m1
- psrad m2, m0
- psrad m3, m0
- movu [r0 + 2 * mmsize], m2
- movu [r0 + 3 * mmsize], m3
-
- ; Row 2
- pmovsxwd m2, [r1 + r2 * 2]
- pmovsxwd m3, [r1 + r2 * 2 + mmsize/2]
- paddd m2, m1
- paddd m3, m1
- psrad m2, m0
- psrad m3, m0
- movu [r0 + 4 * mmsize], m2
- movu [r0 + 5 * mmsize], m3
-
- ; Row 3
- pmovsxwd m2, [r1 + r4]
- pmovsxwd m3, [r1 + r4 + mmsize/2]
- paddd m2, m1
- paddd m3, m1
- psrad m2, m0
- psrad m3, m0
- movu [r0 + 6 * mmsize], m2
- movu [r0 + 7 * mmsize], m3
-
- add r0, 8 * mmsize
+ ; Row 0-1
+ mova m2, [r1]
+ mova m3, [r1 + r2]
+ psubw m2, m1
+ psubw m3, m1
+ psraw m2, m0
+ psraw m3, m0
+ mova [r0 + 0 * mmsize], m2
+ mova [r0 + 1 * mmsize], m3
+
+ ; Row 2-3
+ mova m2, [r1 + r2 * 2]
+ mova m3, [r1 + r4]
+ psubw m2, m1
+ psubw m3, m1
+ psraw m2, m0
+ psraw m3, m0
+ mova [r0 + 2 * mmsize], m2
+ mova [r0 + 3 * mmsize], m3
+
+ add r0, 4 * mmsize
lea r1, [r1 + r2 * 4]
dec r3d
jnz .loop
@@ -3823,62 +3753,47 @@
;--------------------------------------------------------------------------------------
-; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
+; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal cvt16to32_shr_16, 3,4,6
+INIT_XMM sse2
+cglobal cpy2Dto1D_shr_16, 3, 4, 4
add r2d, r2d
movd m0, r3m
- movd m1, r4m
- pshufd m1, m1, 0
+ pcmpeqw m1, m1
+ psllw m1, m0
+ psraw m1, 1
mov r3d, 16/2
; register alloc
; r0 - dst
; r1 - src
- ; r2 - stride
+ ; r2 - srcStride
; r3 - loop counter
; m0 - shift
- ; m1 - dword [offset]
+ ; m1 - word [-round]
.loop:
; Row 0
- pmovsxwd m2, [r1 + 0 * mmsize/2]
- pmovsxwd m3, [r1 + 1 * mmsize/2]
- pmovsxwd m4, [r1 + 2 * mmsize/2]
- pmovsxwd m5, [r1 + 3 * mmsize/2]
- paddd m2, m1
- paddd m3, m1
- paddd m4, m1
- paddd m5, m1
- psrad m2, m0
- psrad m3, m0
- psrad m4, m0
- psrad m5, m0
- movu [r0 + 0 * mmsize], m2
- movu [r0 + 1 * mmsize], m3
- movu [r0 + 2 * mmsize], m4
- movu [r0 + 3 * mmsize], m5
+ mova m2, [r1 + 0 * mmsize]
+ mova m3, [r1 + 1 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psraw m2, m0
+ psraw m3, m0
+ mova [r0 + 0 * mmsize], m2
+ mova [r0 + 1 * mmsize], m3
; Row 1
- pmovsxwd m2, [r1 + r2 + 0 * mmsize/2]
- pmovsxwd m3, [r1 + r2 +1 * mmsize/2]
- pmovsxwd m4, [r1 + r2 +2 * mmsize/2]
- pmovsxwd m5, [r1 + r2 +3 * mmsize/2]
- paddd m2, m1
- paddd m3, m1
- paddd m4, m1
- paddd m5, m1
- psrad m2, m0
- psrad m3, m0
- psrad m4, m0
- psrad m5, m0
- movu [r0 + 4 * mmsize], m2
- movu [r0 + 5 * mmsize], m3
- movu [r0 + 6 * mmsize], m4
- movu [r0 + 7 * mmsize], m5
-
- add r0, 8 * mmsize
+ mova m2, [r1 + r2 + 0 * mmsize]
+ mova m3, [r1 + r2 + 1 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psraw m2, m0
+ psraw m3, m0
+ mova [r0 + 2 * mmsize], m2
+ mova [r0 + 3 * mmsize], m3
+
+ add r0, 4 * mmsize
lea r1, [r1 + r2 * 2]
dec r3d
jnz .loop
@@ -3886,61 +3801,45 @@
;--------------------------------------------------------------------------------------
-; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
+; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal cvt16to32_shr_32, 3,4,6
+INIT_XMM sse2
+cglobal cpy2Dto1D_shr_32, 3, 4, 6
add r2d, r2d
movd m0, r3m
- movd m1, r4m
- pshufd m1, m1, 0
+ pcmpeqw m1, m1
+ psllw m1, m0
+ psraw m1, 1
mov r3d, 32/1
; register alloc
; r0 - dst
; r1 - src
- ; r2 - stride
+ ; r2 - srcStride
; r3 - loop counter
; m0 - shift
- ; m1 - dword [offset]
+ ; m1 - word [-round]
.loop:
; Row 0
- pmovsxwd m2, [r1 + 0 * mmsize/2]
- pmovsxwd m3, [r1 + 1 * mmsize/2]
- pmovsxwd m4, [r1 + 2 * mmsize/2]
- pmovsxwd m5, [r1 + 3 * mmsize/2]
- paddd m2, m1
- paddd m3, m1
- paddd m4, m1
- paddd m5, m1
- psrad m2, m0
- psrad m3, m0
- psrad m4, m0
- psrad m5, m0
- movu [r0 + 0 * mmsize], m2
- movu [r0 + 1 * mmsize], m3
- movu [r0 + 2 * mmsize], m4
- movu [r0 + 3 * mmsize], m5
-
- pmovsxwd m2, [r1 + 4 * mmsize/2]
- pmovsxwd m3, [r1 + 5 * mmsize/2]
- pmovsxwd m4, [r1 + 6 * mmsize/2]
- pmovsxwd m5, [r1 + 7 * mmsize/2]
- paddd m2, m1
- paddd m3, m1
- paddd m4, m1
- paddd m5, m1
- psrad m2, m0
- psrad m3, m0
- psrad m4, m0
- psrad m5, m0
- movu [r0 + 4 * mmsize], m2
- movu [r0 + 5 * mmsize], m3
- movu [r0 + 6 * mmsize], m4
- movu [r0 + 7 * mmsize], m5
-
- add r0, 8 * mmsize
+ mova m2, [r1 + 0 * mmsize]
+ mova m3, [r1 + 1 * mmsize]
+ mova m4, [r1 + 2 * mmsize]
+ mova m5, [r1 + 3 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psubw m4, m1
+ psubw m5, m1
+ psraw m2, m0
+ psraw m3, m0
+ psraw m4, m0
+ psraw m5, m0
+ mova [r0 + 0 * mmsize], m2
+ mova [r0 + 1 * mmsize], m3
+ mova [r0 + 2 * mmsize], m4
+ mova [r0 + 3 * mmsize], m5
+
+ add r0, 4 * mmsize
add r1, r2
dec r3d
jnz .loop
@@ -3948,58 +3847,239 @@
;--------------------------------------------------------------------------------------
-; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
+; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;--------------------------------------------------------------------------------------
INIT_XMM sse2
-cglobal cvt32to16_shl_4, 3,3,5
+cglobal cpy1Dto2D_shl_4, 3, 3, 3
add r2d, r2d
movd m0, r3m
; Row 0-3
+ mova m1, [r1 + 0 * mmsize]
+ mova m2, [r1 + 1 * mmsize]
+ psllw m1, m0
+ psllw m2, m0
+ movh [r0], m1
+ movhps [r0 + r2], m1
+ movh [r0 + r2 * 2], m2
+ lea r2, [r2 * 3]
+ movhps [r0 + r2], m2
+ RET
+
+
+INIT_YMM avx2
+cglobal cpy1Dto2D_shl_4, 3, 3, 2
+ add r2d, r2d
+ movd xm0, r3m
+
+ ; Row 0-3
+ movu m1, [r1]
+ psllw m1, xm0
+ vextracti128 xm0, m1, 1
+ movq [r0], xm1
+ movhps [r0 + r2], xm1
+ lea r0, [r0 + r2 * 2]
+ movq [r0], xm0
+ movhps [r0 + r2], xm0
+ RET
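cpy1Dto2D_shl is the opposite layout transform: a packed 1D coefficient array is left-shifted and scattered into a 2D block, so the stride argument belongs to the destination. A minimal C sketch under the same assumptions (cpy1Dto2D_shl_ref and size are illustrative names):

    static void cpy1Dto2D_shl_ref(int16_t* dst, const int16_t* src,
                                  intptr_t dstStride, int shift, int size)
    {
        for (int y = 0; y < size; y++)
        {
            for (int x = 0; x < size; x++)
                dst[x] = (int16_t)(src[x] << shift);
            dst += dstStride;   /* 2D destination */
            src += size;        /* 1D source: rows are packed */
        }
    }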
+
+
+;--------------------------------------------------------------------------------------
+; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
+;--------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal cpy1Dto2D_shl_8, 3, 4, 5
+ add r2d, r2d
+ movd m0, r3m
+ lea r3, [r2 * 3]
+
+ ; Row 0-3
+ mova m1, [r1 + 0 * mmsize]
+ mova m2, [r1 + 1 * mmsize]
+ mova m3, [r1 + 2 * mmsize]
+ mova m4, [r1 + 3 * mmsize]
+ psllw m1, m0
+ psllw m2, m0
+ psllw m3, m0
+ psllw m4, m0
+ mova [r0], m1
+ mova [r0 + r2], m2
+ mova [r0 + r2 * 2], m3
+ mova [r0 + r3], m4
+ lea r0, [r0 + r2 * 4]
+
+ ; Row 4-7
+ mova m1, [r1 + 4 * mmsize]
+ mova m2, [r1 + 5 * mmsize]
+ mova m3, [r1 + 6 * mmsize]
+ mova m4, [r1 + 7 * mmsize]
+ psllw m1, m0
+ psllw m2, m0
+ psllw m3, m0
+ psllw m4, m0
+ mova [r0], m1
+ mova [r0 + r2], m2
+ mova [r0 + r2 * 2], m3
+ mova [r0 + r3], m4
+ RET
+
+
+INIT_YMM avx2
+cglobal cpy1Dto2D_shl_8, 3, 4, 3
+ add r2d, r2d
+ movd xm0, r3m
+ lea r3, [r2 * 3]
+
+ ; Row 0-3
movu m1, [r1 + 0 * mmsize]
movu m2, [r1 + 1 * mmsize]
- movu m3, [r1 + 2 * mmsize]
- movu m4, [r1 + 3 * mmsize]
- packssdw m1, m2
- packssdw m3, m4
+ psllw m1, xm0
+ psllw m2, xm0
+ movu [r0], xm1
+ vextracti128 [r0 + r2], m1, 1
+ movu [r0 + r2 * 2], xm2
+ vextracti128 [r0 + r3], m2, 1
+
+ ; Row 4-7
+ movu m1, [r1 + 2 * mmsize]
+ movu m2, [r1 + 3 * mmsize]
+ lea r0, [r0 + r2 * 4]
+ psllw m1, xm0
+ psllw m2, xm0
+ movu [r0], xm1
+ vextracti128 [r0 + r2], m1, 1
+ movu [r0 + r2 * 2], xm2
+ vextracti128 [r0 + r3], m2, 1
+ RET
+
+
+;--------------------------------------------------------------------------------------
+; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
+;--------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal cpy1Dto2D_shl_16, 3, 4, 5
+ add r2d, r2d
+ movd m0, r3m
+ mov r3d, 16/4
+
+.loop:
+ ; Row 0-1
+ mova m1, [r1 + 0 * mmsize]
+ mova m2, [r1 + 1 * mmsize]
+ mova m3, [r1 + 2 * mmsize]
+ mova m4, [r1 + 3 * mmsize]
psllw m1, m0
+ psllw m2, m0
psllw m3, m0
- movh [r0], m1
- movhps [r0 + r2], m1
- movh [r0 + r2 * 2], m3
- lea r2, [r2 * 3]
- movhps [r0 + r2], m3
+ psllw m4, m0
+ mova [r0], m1
+ mova [r0 + 16], m2
+ mova [r0 + r2], m3
+ mova [r0 + r2 + 16], m4
+
+ ; Row 2-3
+ mova m1, [r1 + 4 * mmsize]
+ mova m2, [r1 + 5 * mmsize]
+ mova m3, [r1 + 6 * mmsize]
+ mova m4, [r1 + 7 * mmsize]
+ lea r0, [r0 + r2 * 2]
+ psllw m1, m0
+ psllw m2, m0
+ psllw m3, m0
+ psllw m4, m0
+ mova [r0], m1
+ mova [r0 + 16], m2
+ mova [r0 + r2], m3
+ mova [r0 + r2 + 16], m4
+
+ add r1, 8 * mmsize
+ lea r0, [r0 + r2 * 2]
+ dec r3d
+ jnz .loop
RET
INIT_YMM avx2
-cglobal cvt32to16_shl_4, 3,3,3
+cglobal cpy1Dto2D_shl_16, 3, 5, 3
add r2d, r2d
movd xm0, r3m
-
- ; Row 0-3
+ mov r3d, 16/4
+ lea r4, [r2 * 3]
+
+.loop:
+ ; Row 0-1
movu m1, [r1 + 0 * mmsize]
movu m2, [r1 + 1 * mmsize]
- packssdw m1, m2
psllw m1, xm0
- vextracti128 xm0, m1, 1
- movq [r0], xm1
- movq [r0 + r2], xm0
- lea r0, [r0 + r2 * 2]
- movhps [r0], xm1
- movhps [r0 + r2], xm0
+ psllw m2, xm0
+ movu [r0], m1
+ movu [r0 + r2], m2
+
+ ; Row 2-3
+ movu m1, [r1 + 2 * mmsize]
+ movu m2, [r1 + 3 * mmsize]
+ psllw m1, xm0
+ psllw m2, xm0
+ movu [r0 + r2 * 2], m1
+ movu [r0 + r4], m2
+
+ add r1, 4 * mmsize
+ lea r0, [r0 + r2 * 4]
+ dec r3d
+ jnz .loop
RET
;--------------------------------------------------------------------------------------
-; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
+; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;--------------------------------------------------------------------------------------
INIT_XMM sse2
-cglobal cvt32to16_shl_8, 3,5,5
+cglobal cpy1Dto2D_shl_32, 3, 4, 5
add r2d, r2d
movd m0, r3m
- mov r3d, 8/4
- lea r4, [r2 * 3]
+ mov r3d, 32/2
+
+.loop:
+ ; Row 0
+ mova m1, [r1 + 0 * mmsize]
+ mova m2, [r1 + 1 * mmsize]
+ mova m3, [r1 + 2 * mmsize]
+ mova m4, [r1 + 3 * mmsize]
+ psllw m1, m0
+ psllw m2, m0
+ psllw m3, m0
+ psllw m4, m0
+ mova [r0 + 0 * mmsize], m1
+ mova [r0 + 1 * mmsize], m2
+ mova [r0 + 2 * mmsize], m3
+ mova [r0 + 3 * mmsize], m4
+
+ ; Row 1
+ mova m1, [r1 + 4 * mmsize]
+ mova m2, [r1 + 5 * mmsize]
+ mova m3, [r1 + 6 * mmsize]
+ mova m4, [r1 + 7 * mmsize]
+ psllw m1, m0
+ psllw m2, m0
+ psllw m3, m0
+ psllw m4, m0
+ mova [r0 + r2 + 0 * mmsize], m1
+ mova [r0 + r2 + 1 * mmsize], m2
+ mova [r0 + r2 + 2 * mmsize], m3
+ mova [r0 + r2 + 3 * mmsize], m4
+
+ add r1, 8 * mmsize
+ lea r0, [r0 + r2 * 2]
+ dec r3d
+ jnz .loop
+ RET
+
+
+INIT_YMM avx2
+cglobal cpy1Dto2D_shl_32, 3, 4, 5
+ add r2d, r2d
+ movd xm0, r3m
+ mov r3d, 32/2
.loop:
; Row 0-1
@@ -4007,252 +4087,14 @@
movu m2, [r1 + 1 * mmsize]
movu m3, [r1 + 2 * mmsize]
movu m4, [r1 + 3 * mmsize]
- packssdw m1, m2
- packssdw m3, m4
- psllw m1, m0
- psllw m3, m0
+ psllw m1, xm0
+ psllw m2, xm0
+ psllw m3, xm0
+ psllw m4, xm0
movu [r0], m1
+ movu [r0 + mmsize], m2
movu [r0 + r2], m3
-
- ; Row 2-3
- movu m1, [r1 + 4 * mmsize]
- movu m2, [r1 + 5 * mmsize]
- movu m3, [r1 + 6 * mmsize]
- movu m4, [r1 + 7 * mmsize]
- packssdw m1, m2
- packssdw m3, m4
- psllw m1, m0
- psllw m3, m0
- movu [r0 + r2 * 2], m1
- movu [r0 + r4], m3
-
- add r1, 8 * mmsize
- lea r0, [r0 + r2 * 4]
- dec r3d
- jnz .loop
- RET
-
-
-INIT_YMM avx2
-cglobal cvt32to16_shl_8, 3,4,3
- add r2d, r2d
- movd xm0, r3m
- lea r3, [r2 * 3]
-
- ; Row 0-1
- movu xm1, [r1 + 0 * mmsize]
- vinserti128 m1, m1, [r1 + 1 * mmsize], 1
- movu xm2, [r1 + 0 * mmsize + mmsize/2]
- vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
- packssdw m1, m2
- psllw m1, xm0
- movu [r0], xm1
- vextracti128 [r0 + r2], m1, 1
-
- ; Row 2-3
- movu xm1, [r1 + 2 * mmsize]
- vinserti128 m1, m1, [r1 + 3 * mmsize], 1
- movu xm2, [r1 + 2 * mmsize + mmsize/2]
- vinserti128 m2, m2, [r1 + 3 * mmsize + mmsize/2], 1
- packssdw m1, m2
- psllw m1, xm0
- movu [r0 + r2 * 2], xm1
- vextracti128 [r0 + r3], m1, 1
-
- add r1, 4 * mmsize
- lea r0, [r0 + r2 * 4]
-
- ; Row 4-5
- movu m1, [r1 + 0 * mmsize]
- movu m2, [r1 + 1 * mmsize]
- packssdw m1, m2
- vpermq m1, m1, 11011000b
- psllw m1, xm0
- movu [r0], xm1
- vextracti128 [r0 + r2], m1, 1
-
- ; Row 6-7
- movu m1, [r1 + 2 * mmsize]
- movu m2, [r1 + 3 * mmsize]
- packssdw m1, m2
- vpermq m1, m1, 11011000b
- psllw m1, xm0
- movu [r0 + r2 * 2], xm1
- vextracti128 [r0 + r3], m1, 1
- RET
-
-;--------------------------------------------------------------------------------------
-; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
-;--------------------------------------------------------------------------------------
-INIT_XMM sse2
-cglobal cvt32to16_shl_16, 3,4,5
- add r2d, r2d
- movd m0, r3m
- mov r3d, 16/2
-
-.loop:
- ; Row 0
- movu m1, [r1 + 0 * mmsize]
- movu m2, [r1 + 1 * mmsize]
- movu m3, [r1 + 2 * mmsize]
- movu m4, [r1 + 3 * mmsize]
- packssdw m1, m2
- packssdw m3, m4
- psllw m1, m0
- psllw m3, m0
- movu [r0], m1
- movu [r0 + mmsize], m3
-
- ; Row 1
- movu m1, [r1 + 4 * mmsize]
- movu m2, [r1 + 5 * mmsize]
- movu m3, [r1 + 6 * mmsize]
- movu m4, [r1 + 7 * mmsize]
- packssdw m1, m2
- packssdw m3, m4
- psllw m1, m0
- psllw m3, m0
- movu [r0 + r2], m1
- movu [r0 + r2 + mmsize], m3
-
- add r1, 8 * mmsize
- lea r0, [r0 + r2 * 2]
- dec r3d
- jnz .loop
- RET
-
-
-INIT_YMM avx2
-cglobal cvt32to16_shl_16, 3,5,3
- add r2d, r2d
- movd xm0, r3m
- mov r3d, 16/4
- lea r4, [r2 * 3]
-
-.loop:
- ; Row 0
- movu xm1, [r1 + 0 * mmsize]
- vinserti128 m1, m1, [r1 + 1 * mmsize], 1
- movu xm2, [r1 + 0 * mmsize + mmsize/2]
- vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
- packssdw m1, m2
- psllw m1, xm0
- movu [r0], m1
-
- ; Row 1
- movu xm1, [r1 + 2 * mmsize]
- vinserti128 m1, m1, [r1 + 3 * mmsize], 1
- movu xm2, [r1 + 2 * mmsize + mmsize/2]
- vinserti128 m2, m2, [r1 + 3 * mmsize + mmsize/2], 1
- packssdw m1, m2
- psllw m1, xm0
- movu [r0 + r2], m1
-
- add r1, 4 * mmsize
-
- ; Row 2
- movu xm1, [r1 + 0 * mmsize]
- vinserti128 m1, m1, [r1 + 1 * mmsize], 1
- movu xm2, [r1 + 0 * mmsize + mmsize/2]
- vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
- packssdw m1, m2
- psllw m1, xm0
- movu [r0 + r2 * 2], m1
-
- ; Row 3
- movu m1, [r1 + 2 * mmsize]
- movu m2, [r1 + 3 * mmsize]
- packssdw m1, m2
- psllw m1, xm0
- vpermq m1, m1, 11011000b
- movu [r0 + r4], m1
-
- add r1, 4 * mmsize
- lea r0, [r0 + r2 * 4]
- dec r3d
- jnz .loop
- RET
-
-
-;--------------------------------------------------------------------------------------
-; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
-;--------------------------------------------------------------------------------------
-INIT_XMM sse2
-cglobal cvt32to16_shl_32, 3,4,5
- add r2d, r2d
- movd m0, r3m
- mov r3d, 32/1
-
-.loop:
- ; Row 0
- movu m1, [r1 + 0 * mmsize]
- movu m2, [r1 + 1 * mmsize]
- movu m3, [r1 + 2 * mmsize]
- movu m4, [r1 + 3 * mmsize]
- packssdw m1, m2
- packssdw m3, m4
- psllw m1, m0
- psllw m3, m0
- movu [r0 + 0 * mmsize], m1
- movu [r0 + 1 * mmsize], m3
-
- movu m1, [r1 + 4 * mmsize]
- movu m2, [r1 + 5 * mmsize]
- movu m3, [r1 + 6 * mmsize]
- movu m4, [r1 + 7 * mmsize]
- packssdw m1, m2
- packssdw m3, m4
- psllw m1, m0
- psllw m3, m0
- movu [r0 + 2 * mmsize], m1
- movu [r0 + 3 * mmsize], m3
-
- add r1, 8 * mmsize
- add r0, r2
- dec r3d
- jnz .loop
- RET
-
-
-INIT_YMM avx2
-cglobal cvt32to16_shl_32, 3,4,5
- add r2d, r2d
- movd xm0, r3m
- mov r3d, 32/2
-
-.loop:
- ; Row 0
- movu xm1, [r1 + 0 * mmsize]
- vinserti128 m1, m1, [r1 + 1 * mmsize], 1
- movu xm2, [r1 + 0 * mmsize + mmsize/2]
- vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
- movu xm3, [r1 + 2 * mmsize]
- vinserti128 m3, m3, [r1 + 3 * mmsize], 1
- movu xm4, [r1 + 2 * mmsize + mmsize/2]
- vinserti128 m4, m4, [r1 + 3 * mmsize + mmsize/2], 1
- packssdw m1, m2
- packssdw m3, m4
- psllw m1, xm0
- psllw m3, xm0
- movu [r0], m1
- movu [r0 + mmsize], m3
-
- add r1, 4 * mmsize
-
- ; Row 1
- movu xm1, [r1 + 0 * mmsize]
- vinserti128 m1, m1, [r1 + 1 * mmsize], 1
- movu xm2, [r1 + 0 * mmsize + mmsize/2]
- vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
- movu m3, [r1 + 2 * mmsize]
- movu m4, [r1 + 3 * mmsize]
- packssdw m1, m2
- packssdw m3, m4
- psllw m1, xm0
- psllw m3, xm0
- vpermq m3, m3, 11011000b
- movu [r0 + r2], m1
- movu [r0 + r2 + mmsize], m3
+ movu [r0 + r2 + mmsize], m4
add r1, 4 * mmsize
lea r0, [r0 + r2 * 2]
@@ -4262,7 +4104,7 @@
;--------------------------------------------------------------------------------------
-; uint32_t copy_cnt(int16_t *dst, int16_t *src, intptr_t stride);
+; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
;--------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal copy_cnt_4, 3,3,3
@@ -4301,7 +4143,7 @@
;--------------------------------------------------------------------------------------
-; uint32_t copy_cnt(int16_t *dst, int16_t *src, intptr_t stride);
+; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
;--------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal copy_cnt_8, 3,3,6
@@ -4405,7 +4247,7 @@
;--------------------------------------------------------------------------------------
-; uint32_t copy_cnt(int16_t *dst, int16_t *src, intptr_t stride);
+; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
;--------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal copy_cnt_16, 3,4,6
@@ -4516,7 +4358,7 @@
RET
;--------------------------------------------------------------------------------------
-; uint32_t copy_cnt(int32_t *dst, int16_t *src, intptr_t stride);
+; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
;--------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal copy_cnt_32, 3,4,6
@@ -4623,180 +4465,432 @@
movd eax, xm4
RET
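copy_cnt packs the 2D coefficient block into a 1D array and returns the number of nonzero coefficients (the asm builds a compare mask per row and accumulates its popcount). A minimal C sketch of the intended semantics (copy_cnt_ref and size are illustrative names):

    static uint32_t copy_cnt_ref(int16_t* dst, const int16_t* src,
                                 intptr_t srcStride, int size)
    {
        uint32_t numSig = 0;    /* nonzero coefficient count */
        for (int y = 0; y < size; y++)
        {
            for (int x = 0; x < size; x++)
            {
                dst[x] = src[x];
                numSig += (src[x] != 0);
            }
            dst += size;
            src += srcStride;
        }
        return numSig;
    }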
-;-----------------------------------------------------------------------------
-; void copy_shr(short *dst, short *src, intptr_t stride, int shift, int size)
-;-----------------------------------------------------------------------------
-
-INIT_XMM sse4
-cglobal copy_shr, 4, 7, 4, dst, src, stride
-%define rnd m2
-%define shift m1
-
- ; make shift
- mov r5d, r3m
- movd shift, r5d
-
- ; make round
- dec r5
- xor r6, r6
- bts r6, r5
-
- movd rnd, r6d
- pshufd rnd, rnd, 0
+
+;--------------------------------------------------------------------------------------
+; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+;--------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal cpy2Dto1D_shl_4, 4, 4, 4
+ add r2d, r2d
+ movd m0, r3d
; register alloc
; r0 - dst
; r1 - src
- ; r2 - stride * 2 (short*)
- ; r3 - lx
- ; r4 - size
- ; r5 - ly
- ; r6 - diff
- add r2d, r2d
-
- mov r4d, r4m
- mov r5, r4 ; size
- mov r6, r2 ; stride
- sub r6, r4
- add r6, r6
-
- shr r5, 1
-.loop_row:
-
- mov r3, r4
- shr r3, 2
-.loop_col:
- ; row 0
- movh m3, [r1]
- pmovsxwd m0, m3
- paddd m0, rnd
- psrad m0, shift
- packssdw m0, m0
- movh [r0], m0
-
- ; row 1
- movh m3, [r1 + r4 * 2]
- pmovsxwd m0, m3
- paddd m0, rnd
- psrad m0, shift
- packssdw m0, m0
- movh [r0 + r2], m0
-
- ; move col pointer
- add r1, 8
- add r0, 8
-
- dec r3
- jg .loop_col
-
- ; update pointer
- lea r1, [r1 + r4 * 2]
- add r0, r6
-
- ; end of loop_row
- dec r5
- jg .loop_row
+ ; r2 - srcStride
+ ; m0 - shift
+
+ ; Row 0-3
+ movh m2, [r1]
+ movhps m2, [r1 + r2]
+ lea r1, [r1 + r2 * 2]
+ movh m3, [r1]
+ movhps m3, [r1 + r2]
+ psllw m2, m0
+ psllw m3, m0
+ mova [r0 + 0 * mmsize], m2
+ mova [r0 + 1 * mmsize], m3
RET
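cpy2Dto1D_shl mirrors the cpy2Dto1D_shr kernels above but shifts left, so no rounding constant is needed and each kernel reduces to load, psllw, store; in the reference sketch the inner statement would simply become dst[x] = (int16_t)(src[x] << shift) with the same 2D-source/1D-destination stride handling.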
+
;--------------------------------------------------------------------------------------
-; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
+; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
INIT_XMM sse2
-cglobal copy_shl_4, 3,3,3
+cglobal cpy2Dto1D_shl_8, 4, 5, 4
+ add r2d, r2d
+ movd m0, r3d
+ mov r3d, 8/4
+ lea r4, [r2 * 3]
+
+ ; register alloc
+ ; r0 - dst
+ ; r1 - src
+ ; r2 - srcStride
+ ; r3 - loop counter
+ ; r4 - stride * 3
+ ; m0 - shift
+
+.loop:
+    ; Row 0-1
+ mova m2, [r1]
+ mova m3, [r1 + r2]
+ psllw m2, m0
+ psllw m3, m0
+ mova [r0 + 0 * mmsize], m2
+ mova [r0 + 1 * mmsize], m3
+
+    ; Row 2-3
+ mova m2, [r1 + r2 * 2]
+ mova m3, [r1 + r4]
+ psllw m2, m0
+ psllw m3, m0
+ mova [r0 + 2 * mmsize], m2
+ mova [r0 + 3 * mmsize], m3
+
+ add r0, 4 * mmsize
+ lea r1, [r1 + r2 * 4]
+ dec r3d
+ jnz .loop
+ RET
+
+
+;--------------------------------------------------------------------------------------
+; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+;--------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal cpy2Dto1D_shl_16, 4, 4, 4
+ add r2d, r2d
+ movd m0, r3d
+ mov r3d, 16/2
+
+ ; register alloc
+ ; r0 - dst
+ ; r1 - src
+ ; r2 - srcStride
+ ; r3 - loop counter
+ ; m0 - shift
+
+.loop:
+ ; Row 0
+ mova m2, [r1 + 0 * mmsize]
+ mova m3, [r1 + 1 * mmsize]
+ psllw m2, m0
+ psllw m3, m0
+ mova [r0 + 0 * mmsize], m2
+ mova [r0 + 1 * mmsize], m3
+
+ ; Row 1
+ mova m2, [r1 + r2 + 0 * mmsize]
+ mova m3, [r1 + r2 + 1 * mmsize]
+ psllw m2, m0
+ psllw m3, m0
+ mova [r0 + 2 * mmsize], m2
+ mova [r0 + 3 * mmsize], m3
+
+ add r0, 4 * mmsize
+ lea r1, [r1 + r2 * 2]
+ dec r3d
+ jnz .loop
+ RET
+
+
+;--------------------------------------------------------------------------------------
+; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+;--------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal cpy2Dto1D_shl_32, 4, 4, 6
+ add r2d, r2d
+ movd m0, r3d
+ mov r3d, 32/1
+
+ ; register alloc
+ ; r0 - dst
+ ; r1 - src
+ ; r2 - srcStride
+ ; r3 - loop counter
+ ; m0 - shift
+
+.loop:
+ ; Row 0
+ mova m2, [r1 + 0 * mmsize]
+ mova m3, [r1 + 1 * mmsize]
+ mova m4, [r1 + 2 * mmsize]
+ mova m5, [r1 + 3 * mmsize]
+ psllw m2, m0
+ psllw m3, m0
+ psllw m4, m0
+ psllw m5, m0
+ mova [r0 + 0 * mmsize], m2
+ mova [r0 + 1 * mmsize], m3
+ mova [r0 + 2 * mmsize], m4
+ mova [r0 + 3 * mmsize], m5
+
+ add r0, 4 * mmsize
+ add r1, r2
+ dec r3d
+ jnz .loop
+ RET
+
+
+;--------------------------------------------------------------------------------------
+; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
+;--------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal cpy1Dto2D_shr_4, 3, 3, 4
add r2d, r2d
movd m0, r3m
+ pcmpeqw m1, m1
+ psllw m1, m0
+ psraw m1, 1
; Row 0-3
- movu m1, [r1 + 0 * mmsize]
- movu m2, [r1 + 1 * mmsize]
- psllw m1, m0
- psllw m2, m0
- movh [r0], m1
- movhps [r0 + r2], m1
- movh [r0 + r2 * 2], m2
+ mova m2, [r1 + 0 * mmsize]
+ mova m3, [r1 + 1 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psraw m2, m0
+ psraw m3, m0
+ movh [r0], m2
+ movhps [r0 + r2], m2
+ movh [r0 + r2 * 2], m3
lea r2, [r2 * 3]
- movhps [r0 + r2], m2
+ movhps [r0 + r2], m3
RET
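cpy1Dto2D_shr completes the set: a packed 1D source is rounded and right-shifted into a 2D destination, i.e. dst[y * dstStride + x] = (src[y * N + x] + round) >> shift with round = 1 << (shift - 1), reusing the same pcmpeqw/psllw/psraw constant trick described above.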
+
+INIT_YMM avx2
+cglobal cpy1Dto2D_shr_4, 3, 3, 3
+ add r2d, r2d
+ movd xm0, r3m
+ pcmpeqw m1, m1
+ psllw m1, xm0
+ psraw m1, 1
+
+ ; Row 0-3
+ movu m2, [r1]
+ psubw m2, m1
+ psraw m2, xm0
+ vextracti128 xm1, m2, 1
+ movq [r0], xm2
+ movhps [r0 + r2], xm2
+ lea r0, [r0 + r2 * 2]
+ movq [r0], xm1
+ movhps [r0 + r2], xm1
+ RET
+
+
;--------------------------------------------------------------------------------------
-; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
+; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;--------------------------------------------------------------------------------------
INIT_XMM sse2
-cglobal copy_shl_8, 3,4,5
+cglobal cpy1Dto2D_shr_8, 3, 4, 6
add r2d, r2d
movd m0, r3m
+ pcmpeqw m1, m1
+ psllw m1, m0
+ psraw m1, 1
+ lea r3, [r2 * 3]
; Row 0-3
- movu m1, [r1 + 0 * mmsize]
- movu m2, [r1 + 1 * mmsize]
- movu m3, [r1 + 2 * mmsize]
- movu m4, [r1 + 3 * mmsize]
- psllw m1, m0
- psllw m2, m0
- psllw m3, m0
- psllw m4, m0
- movu [r0], m1
- movu [r0 + r2], m2
- movu [r0 + 2 * r2], m3
- lea r0, [r0 + 2 * r2]
- movu [r0 + r2], m4
+ mova m2, [r1 + 0 * mmsize]
+ mova m3, [r1 + 1 * mmsize]
+ mova m4, [r1 + 2 * mmsize]
+ mova m5, [r1 + 3 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psubw m4, m1
+ psubw m5, m1
+ psraw m2, m0
+ psraw m3, m0
+ psraw m4, m0
+ psraw m5, m0
+ mova [r0], m2
+ mova [r0 + r2], m3
+ mova [r0 + r2 * 2], m4
+ mova [r0 + r3], m5
; Row 4-7
- movu m1, [r1 + 4 * mmsize]
- movu m2, [r1 + 5 * mmsize]
- movu m3, [r1 + 6 * mmsize]
- movu m4, [r1 + 7 * mmsize]
- psllw m1, m0
- psllw m2, m0
- psllw m3, m0
- psllw m4, m0
- movu [r0 + r2 * 2], m1
- lea r0, [r0 + 2 * r2]
- movu [r0 + r2], m2
- movu [r0 + 2 * r2], m3
- lea r0, [r0 + 2 * r2]
- movu [r0 + r2], m4
+ mova m2, [r1 + 4 * mmsize]
+ mova m3, [r1 + 5 * mmsize]
+ mova m4, [r1 + 6 * mmsize]
+ mova m5, [r1 + 7 * mmsize]
+ lea r0, [r0 + r2 * 4]
+ psubw m2, m1
+ psubw m3, m1
+ psubw m4, m1
+ psubw m5, m1
+ psraw m2, m0
+ psraw m3, m0
+ psraw m4, m0
+ psraw m5, m0
+ mova [r0], m2
+ mova [r0 + r2], m3
+ mova [r0 + r2 * 2], m4
+ mova [r0 + r3], m5
RET
+
+INIT_YMM avx2
+cglobal cpy1Dto2D_shr_8, 3, 4, 4
+ add r2d, r2d
+ movd xm0, r3m
+ pcmpeqw m1, m1
+ psllw m1, xm0
+ psraw m1, 1
+ lea r3, [r2 * 3]
+
+ ; Row 0-3
+ movu m2, [r1 + 0 * mmsize]
+ movu m3, [r1 + 1 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psraw m2, xm0
+ psraw m3, xm0
+ movu [r0], xm2
+ vextracti128 [r0 + r2], m2, 1
+ movu [r0 + r2 * 2], xm3
+ vextracti128 [r0 + r3], m3, 1
+
+ ; Row 4-7
+ movu m2, [r1 + 2 * mmsize]
+ movu m3, [r1 + 3 * mmsize]
+ lea r0, [r0 + r2 * 4]
+ psubw m2, m1
+ psubw m3, m1
+ psraw m2, xm0
+ psraw m3, xm0
+ movu [r0], xm2
+ vextracti128 [r0 + r2], m2, 1
+ movu [r0 + r2 * 2], xm3
+ vextracti128 [r0 + r3], m3, 1
+ RET
+
+
;--------------------------------------------------------------------------------------
-; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
+; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;--------------------------------------------------------------------------------------
INIT_XMM sse2
-cglobal copy_shl_16, 3,4,5
+cglobal cpy1Dto2D_shr_16, 3, 5, 6
add r2d, r2d
movd m0, r3m
- mov r3d, 256/64
+ pcmpeqw m1, m1
+ psllw m1, m0
+ psraw m1, 1
+ mov r3d, 16/4
+ lea r4, [r2 * 3]
.loop:
- ; Row 0-3
- movu m1, [r1 + 0 * mmsize]
- movu m2, [r1 + 1 * mmsize]
- movu m3, [r1 + 2 * mmsize]
- movu m4, [r1 + 3 * mmsize]
+ ; Row 0-1
+ mova m2, [r1 + 0 * mmsize]
+ mova m3, [r1 + 1 * mmsize]
+ mova m4, [r1 + 2 * mmsize]
+ mova m5, [r1 + 3 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psubw m4, m1
+ psubw m5, m1
+ psraw m2, m0
+ psraw m3, m0
+ psraw m4, m0
+ psraw m5, m0
+ mova [r0], m2
+ mova [r0 + mmsize], m3
+ mova [r0 + r2], m4
+ mova [r0 + r2 + mmsize], m5
+
+ ; Row 2-3
+ mova m2, [r1 + 4 * mmsize]
+ mova m3, [r1 + 5 * mmsize]
+ mova m4, [r1 + 6 * mmsize]
+ mova m5, [r1 + 7 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psubw m4, m1
+ psubw m5, m1
+ psraw m2, m0
+ psraw m3, m0
+ psraw m4, m0
+ psraw m5, m0
+ mova [r0 + r2 * 2], m2
+ mova [r0 + r2 * 2 + mmsize], m3
+ mova [r0 + r4], m4
+ mova [r0 + r4 + mmsize], m5
+
+ add r1, 8 * mmsize
+ lea r0, [r0 + r2 * 4]
+ dec r3d
+ jnz .loop
+ RET
+
+
+INIT_YMM avx2
+cglobal cpy1Dto2D_shr_16, 3, 5, 4
+ add r2d, r2d
+ movd xm0, r3m
+ pcmpeqw m1, m1
+ psllw m1, xm0
+ psraw m1, 1
+ mov r3d, 16/4
+ lea r4, [r2 * 3]
+
+.loop:
+ ; Row 0-1
+ movu m2, [r1 + 0 * mmsize]
+ movu m3, [r1 + 1 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psraw m2, xm0
+ psraw m3, xm0
+ movu [r0], m2
+ movu [r0 + r2], m3
+
+ ; Row 2-3
+ movu m2, [r1 + 2 * mmsize]
+ movu m3, [r1 + 3 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psraw m2, xm0
+ psraw m3, xm0
+ movu [r0 + r2 * 2], m2
+ movu [r0 + r4], m3
+
+ add r1, 4 * mmsize
+ lea r0, [r0 + r2 * 4]
+ dec r3d
+ jnz .loop
+ RET
+
+
+;--------------------------------------------------------------------------------------
+; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
+;--------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal cpy1Dto2D_shr_32, 3, 4, 6
+ add r2d, r2d
+ movd m0, r3m
+ pcmpeqw m1, m1
psllw m1, m0
- psllw m2, m0
- psllw m3, m0
- psllw m4, m0
- movu [r0], m1
- movu [r0 + 16], m2
- movu [r0 + r2], m3
- movu [r0 + r2 + 16], m4
-
- ; Row 4-7
- movu m1, [r1 + 4 * mmsize]
- movu m2, [r1 + 5 * mmsize]
- movu m3, [r1 + 6 * mmsize]
- movu m4, [r1 + 7 * mmsize]
- psllw m1, m0
- psllw m2, m0
- psllw m3, m0
- psllw m4, m0
- movu [r0 + r2 * 2], m1
- movu [r0 + r2 * 2 + 16], m2
- lea r0, [r0 + r2 * 2]
- movu [r0 + r2], m3
- movu [r0 + r2 + 16], m4
+ psraw m1, 1
+ mov r3d, 32/2
+
+.loop:
+ ; Row 0
+ mova m2, [r1 + 0 * mmsize]
+ mova m3, [r1 + 1 * mmsize]
+ mova m4, [r1 + 2 * mmsize]
+ mova m5, [r1 + 3 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psubw m4, m1
+ psubw m5, m1
+ psraw m2, m0
+ psraw m3, m0
+ psraw m4, m0
+ psraw m5, m0
+ mova [r0 + 0 * mmsize], m2
+ mova [r0 + 1 * mmsize], m3
+ mova [r0 + 2 * mmsize], m4
+ mova [r0 + 3 * mmsize], m5
+
+ ; Row 1
+ mova m2, [r1 + 4 * mmsize]
+ mova m3, [r1 + 5 * mmsize]
+ mova m4, [r1 + 6 * mmsize]
+ mova m5, [r1 + 7 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psubw m4, m1
+ psubw m5, m1
+ psraw m2, m0
+ psraw m3, m0
+ psraw m4, m0
+ psraw m5, m0
+ mova [r0 + r2 + 0 * mmsize], m2
+ mova [r0 + r2 + 1 * mmsize], m3
+ mova [r0 + r2 + 2 * mmsize], m4
+ mova [r0 + r2 + 3 * mmsize], m5
add r1, 8 * mmsize
lea r0, [r0 + r2 * 2]
@@ -4804,45 +4898,36 @@
jnz .loop
RET
-;--------------------------------------------------------------------------------------
-; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
-;--------------------------------------------------------------------------------------
-INIT_XMM sse2
-cglobal copy_shl_32, 3,4,5
+
+INIT_YMM avx2
+cglobal cpy1Dto2D_shr_32, 3, 4, 6
add r2d, r2d
- movd m0, r3m
- mov r3d, 1024/64
+ movd xm0, r3m
+ pcmpeqw m1, m1
+ psllw m1, xm0
+ psraw m1, 1
+ mov r3d, 32/2
.loop:
- ; Row 0-3
- movu m1, [r1 + 0 * mmsize]
- movu m2, [r1 + 1 * mmsize]
- movu m3, [r1 + 2 * mmsize]
- movu m4, [r1 + 3 * mmsize]
- psllw m1, m0
- psllw m2, m0
- psllw m3, m0
- psllw m4, m0
- movu [r0], m1
- movu [r0 + 16], m2
- movu [r0 + 32], m3
- movu [r0 + 48], m4
-
- ; Row 4-7
- movu m1, [r1 + 4 * mmsize]
- movu m2, [r1 + 5 * mmsize]
- movu m3, [r1 + 6 * mmsize]
- movu m4, [r1 + 7 * mmsize]
- psllw m1, m0
- psllw m2, m0
- psllw m3, m0
- psllw m4, m0
- movu [r0 + r2], m1
- movu [r0 + r2 + 16], m2
- movu [r0 + r2 + 32], m3
- movu [r0 + r2 + 48], m4
-
- add r1, 8 * mmsize
+ ; Row 0-1
+ movu m2, [r1 + 0 * mmsize]
+ movu m3, [r1 + 1 * mmsize]
+ movu m4, [r1 + 2 * mmsize]
+ movu m5, [r1 + 3 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psubw m4, m1
+ psubw m5, m1
+ psraw m2, xm0
+ psraw m3, xm0
+ psraw m4, xm0
+ psraw m5, xm0
+ movu [r0], m2
+ movu [r0 + mmsize], m3
+ movu [r0 + r2], m4
+ movu [r0 + r2 + mmsize], m5
+
+ add r1, 4 * mmsize
lea r0, [r0 + r2 * 2]
dec r3d
jnz .loop
diff -r dfe0803ae6be -r b4454aa1b6ab source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h Wed Nov 26 16:56:00 2014 -0600
+++ b/source/common/x86/blockcopy8.h Thu Nov 27 10:12:03 2014 +0900
@@ -24,32 +24,38 @@
#ifndef X265_BLOCKCOPY8_H
#define X265_BLOCKCOPY8_H
-void x265_cvt32to16_shl_4_sse2(int16_t* dst, const int* src, intptr_t, int);
-void x265_cvt32to16_shl_8_sse2(int16_t* dst, const int* src, intptr_t, int);
-void x265_cvt32to16_shl_16_sse2(int16_t* dst, const int* src, intptr_t, int);
-void x265_cvt32to16_shl_32_sse2(int16_t* dst, const int* src, intptr_t, int);
-void x265_cvt32to16_shl_4_avx2(int16_t* dst, const int* src, intptr_t, int);
-void x265_cvt32to16_shl_8_avx2(int16_t* dst, const int* src, intptr_t, int);
-void x265_cvt32to16_shl_16_avx2(int16_t* dst, const int* src, intptr_t, int);
-void x265_cvt32to16_shl_32_avx2(int16_t* dst, const int* src, intptr_t, int);
-void x265_copy16to16_shl_sse2(int16_t* dst, const int16_t* src, intptr_t, int32_t, int32_t);
-void x265_cvt16to32_shr_4_sse4(int32_t* dst, const int16_t* src, intptr_t, int32_t, int32_t);
-void x265_cvt16to32_shr_8_sse4(int32_t* dst, const int16_t* src, intptr_t, int32_t, int32_t);
-void x265_cvt16to32_shr_16_sse4(int32_t* dst, const int16_t* src, intptr_t, int32_t, int32_t);
-void x265_cvt16to32_shr_32_sse4(int32_t* dst, const int16_t* src, intptr_t, int32_t, int32_t);
-void x265_copy_shr_sse4(int16_t* dst, const int16_t* src, intptr_t, int, int);
-void x265_copy_shl_4_sse2(int16_t* dst, const int16_t* src, intptr_t, int);
-void x265_copy_shl_8_sse2(int16_t* dst, const int16_t* src, intptr_t, int);
-void x265_copy_shl_16_sse2(int16_t* dst, const int16_t* src, intptr_t, int);
-void x265_copy_shl_32_sse2(int16_t* dst, const int16_t* src, intptr_t, int);
-uint32_t x265_copy_cnt_4_sse4(int16_t* dst, const int16_t* src, intptr_t);
-uint32_t x265_copy_cnt_8_sse4(int16_t* dst, const int16_t* src, intptr_t);
-uint32_t x265_copy_cnt_16_sse4(int16_t* dst, const int16_t* src, intptr_t);
-uint32_t x265_copy_cnt_32_sse4(int16_t* dst, const int16_t* src, intptr_t);
-uint32_t x265_copy_cnt_4_avx2(int16_t* dst, const int16_t* src, intptr_t);
-uint32_t x265_copy_cnt_8_avx2(int16_t* dst, const int16_t* src, intptr_t);
-uint32_t x265_copy_cnt_16_avx2(int16_t* dst, const int16_t* src, intptr_t);
-uint32_t x265_copy_cnt_32_avx2(int16_t* dst, const int16_t* src, intptr_t);
+void x265_cpy2Dto1D_shl_4_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shl_8_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shl_16_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shl_32_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shr_4_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shr_8_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shr_16_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shr_32_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy1Dto2D_shl_4_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shl_8_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shl_16_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shl_32_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shl_4_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shl_8_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shl_16_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shl_32_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_4_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_8_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_16_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_32_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_4_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_8_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_16_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+void x265_cpy1Dto2D_shr_32_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
+uint32_t x265_copy_cnt_4_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
+uint32_t x265_copy_cnt_8_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
+uint32_t x265_copy_cnt_16_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
+uint32_t x265_copy_cnt_32_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
+uint32_t x265_copy_cnt_4_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
+uint32_t x265_copy_cnt_8_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
+uint32_t x265_copy_cnt_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
+uint32_t x265_copy_cnt_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
#define SETUP_BLOCKCOPY_FUNC(W, H, cpu) \
void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); \
@@ -181,17 +187,17 @@
void x265_blockfill_s_8x8_sse2(int16_t* dst, intptr_t dstride, int16_t val);
void x265_blockfill_s_16x16_sse2(int16_t* dst, intptr_t dstride, int16_t val);
void x265_blockfill_s_32x32_sse2(int16_t* dst, intptr_t dstride, int16_t val);
-void x265_blockcopy_ss_16x4_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
-void x265_blockcopy_ss_16x8_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
-void x265_blockcopy_ss_16x12_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
-void x265_blockcopy_ss_16x16_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
-void x265_blockcopy_ss_16x24_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
-void x265_blockcopy_ss_16x32_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
-void x265_blockcopy_ss_16x64_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
-void x265_blockcopy_ss_64x16_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
-void x265_blockcopy_ss_64x32_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
-void x265_blockcopy_ss_64x48_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
-void x265_blockcopy_ss_64x64_avx(int16_t* dest, intptr_t deststride, const int16_t* src, intptr_t srcstride);
+void x265_blockcopy_ss_16x4_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_16x8_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_16x12_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_16x16_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_16x24_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_16x32_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_16x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_64x16_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_64x32_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_64x48_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
+void x265_blockcopy_ss_64x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
void x265_blockcopy_pp_32x8_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
void x265_blockcopy_pp_32x16_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
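
The renames above make the copy direction explicit in the name: cpy2Dto1D_* read a strided block into a packed buffer, so the stride belongs to src, while cpy1Dto2D_* write a packed buffer into a strided block, so the stride belongs to dst. A scalar sketch of the 2D-to-1D direction, under the same hypothetical-helper caveat as the sketch above:

    #include <stdint.h>

    /* Read a strided (2D) residual block, shift left, and pack it into
     * a linear (1D) coefficient buffer. */
    static void cpy2Dto1D_shl_ref(int16_t* dst, const int16_t* src,
                                  intptr_t srcStride, int shift, int size)
    {
        for (int i = 0; i < size; i++)
        {
            for (int j = 0; j < size; j++)
                dst[j] = (int16_t)(src[j] << shift);
            src += srcStride;
            dst += size;
        }
    }
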
diff -r dfe0803ae6be -r b4454aa1b6ab source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm Wed Nov 26 16:56:00 2014 -0600
+++ b/source/common/x86/dct8.asm Thu Nov 27 10:12:03 2014 +0900
@@ -318,7 +318,7 @@
cextern pw_ppppmmmm
;------------------------------------------------------
-;void dct4(int16_t *src, int16_t *dst, intptr_t stride)
+;void dct4(const int16_t* src, int16_t* dst, intptr_t srcStride)
;------------------------------------------------------
INIT_XMM sse2
cglobal dct4, 3, 4, 8
@@ -475,7 +475,7 @@
RET
;-------------------------------------------------------
-;void idct4(int16_t *src, int16_t *dst, intptr_t stride)
+;void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------
INIT_XMM sse2
cglobal idct4, 3, 4, 7
@@ -565,7 +565,7 @@
RET
;------------------------------------------------------
-;void dst4(int16_t *src, int16_t *dst, intptr_t stride)
+;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride)
;------------------------------------------------------
INIT_XMM ssse3
%if ARCH_X86_64
@@ -657,7 +657,7 @@
RET
;-------------------------------------------------------
-;void idst4(int16_t *src, int16_t *dst, intptr_t stride)
+;void idst4(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------
INIT_XMM sse2
cglobal idst4, 3, 4, 7
@@ -750,7 +750,7 @@
;-------------------------------------------------------
-; void dct8(int16_t *src, int16_t *dst, intptr_t stride)
+; void dct8(const int16_t* src, int16_t* dst, intptr_t srcStride)
;-------------------------------------------------------
INIT_XMM sse4
cglobal dct8, 3,6,7,0-16*mmsize
@@ -974,7 +974,7 @@
RET
;-------------------------------------------------------
-; void idct8(int16_t *src, int16_t *dst, intptr_t stride)
+; void idct8(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------
INIT_XMM ssse3
@@ -1164,7 +1164,7 @@
;-----------------------------------------------------------------------------
-; void denoise_dct(int16_t *dct, uint32_t *sum, uint16_t *offset, int size)
+; void denoise_dct(int16_t* dct, uint32_t* sum, uint16_t* offset, int size)
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal denoise_dct, 4, 4, 6
@@ -2106,7 +2106,7 @@
%endmacro
;-------------------------------------------------------
-; void idct16(int16_t *src, int16_t *dst, intptr_t stride)
+; void idct16(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------
INIT_YMM avx2
cglobal idct16, 3, 7, 16, 0-16*mmsize
@@ -2385,7 +2385,7 @@
%endmacro
;-------------------------------------------------------
-; void idct32(int16_t *src, int16_t *dst, intptr_t stride)
+; void idct32(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------
; TODO: Reduce PHADDD instruction by PADDD
@@ -2684,7 +2684,7 @@
RET
;-------------------------------------------------------
-; void idct4(int16_t *src, int16_t *dst, intptr_t stride)
+; void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------
INIT_YMM avx2
cglobal idct4, 3, 4, 6
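
These comment updates track the header renames: the forward transforms read a strided residual and write packed coefficients, and the inverse transforms read packed coefficients and write a strided residual. A hedged usage sketch of a 16x16 round trip against the prototypes from dct8.h below (illustrative only; assumes linking against the assembly objects):

    #include <stdint.h>

    extern "C" {
    void x265_dct16_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
    void x265_idct16_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
    }

    void roundTrip16(int16_t* residual, intptr_t stride)
    {
        int16_t coeff[16 * 16];                    /* packed, no stride */
        x265_dct16_avx2(residual, coeff, stride);  /* stride applies to src */
        x265_idct16_avx2(coeff, residual, stride); /* stride applies to dst */
    }
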
diff -r dfe0803ae6be -r b4454aa1b6ab source/common/x86/dct8.h
--- a/source/common/x86/dct8.h Wed Nov 26 16:56:00 2014 -0600
+++ b/source/common/x86/dct8.h Thu Nov 27 10:12:03 2014 +0900
@@ -23,21 +23,21 @@
#ifndef X265_DCT8_H
#define X265_DCT8_H
-void x265_dct4_sse2(const int16_t* src, int16_t* dst, intptr_t stride);
-void x265_dst4_ssse3(const int16_t* src, int16_t* dst, intptr_t stride);
-void x265_dct8_sse4(const int16_t* src, int16_t* dst, intptr_t stride);
-void x265_dct4_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
-void x265_dct8_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
-void x265_dct16_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
-void x265_dct32_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
-void x265_idct32_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
+void x265_dct4_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void x265_dst4_ssse3(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void x265_dct8_sse4(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void x265_dct4_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void x265_dct8_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void x265_dct16_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void x265_dct32_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
-void x265_idst4_sse2(const int16_t* src, int16_t* dst, intptr_t stride);
-void x265_idct4_sse2(const int16_t* src, int16_t* dst, intptr_t stride);
-void x265_idct4_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
-void x265_idct8_ssse3(const int16_t* src, int16_t* dst, intptr_t stride);
-void x265_idct8_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
-void x265_idct16_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
+void x265_idst4_sse2(const int16_t* src, int16_t* dst, intptr_t dstStride);
+void x265_idct4_sse2(const int16_t* src, int16_t* dst, intptr_t dstStride);
+void x265_idct4_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
+void x265_idct8_ssse3(const int16_t* src, int16_t* dst, intptr_t dstStride);
+void x265_idct8_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
+void x265_idct16_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
+void x265_idct32_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
void x265_denoise_dct_sse4(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
void x265_denoise_dct_avx2(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
diff -r dfe0803ae6be -r b4454aa1b6ab source/encoder/search.cpp
--- a/source/encoder/search.cpp Wed Nov 26 16:56:00 2014 -0600
+++ b/source/encoder/search.cpp Thu Nov 27 10:12:03 2014 +0900
@@ -2211,8 +2211,8 @@
if (bTryZero)
{
/* coincident blocks of the two reference pictures */
- const pixel *ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
- const pixel *ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
+ const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
+ const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
intptr_t refStride = slice->m_mref[0][0].lumaStride;
primitives.pixelavg_pp[partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
diff -r dfe0803ae6be -r b4454aa1b6ab source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Wed Nov 26 16:56:00 2014 -0600
+++ b/source/test/pixelharness.cpp Thu Nov 27 10:12:03 2014 +0900
@@ -344,60 +344,7 @@
return true;
}
-bool PixelHarness::check_copy16to16_shl_t(cpy16to16_shl_t ref, cpy16to16_shl_t opt)
-{
- ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
- ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
-
- int j = 0;
- intptr_t stride = STRIDE;
- for (int i = 0; i < ITERS; i++)
- {
- int shift = (rand() % 7 + 1);
-
- int index = i % TEST_CASES;
- checked(opt, opt_dest, short_test_buff[index] + j, stride, shift, (int)stride);
- ref(ref_dest, short_test_buff[index] + j, stride, shift, (int)stride);
-
- if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
- return false;
-
- reportfail();
- j += INCR;
- }
-
- return true;
-}
-
-bool PixelHarness::check_cvt16to32_shr_t(cvt16to32_shr_t ref, cvt16to32_shr_t opt)
-{
- ALIGN_VAR_16(int32_t, ref_dest[64 * 64]);
- ALIGN_VAR_16(int32_t, opt_dest[64 * 64]);
-
- memset(ref_dest, 0xCD, sizeof(ref_dest));
- memset(opt_dest, 0xCD, sizeof(opt_dest));
-
- int j = 0;
- intptr_t stride = STRIDE;
- for (int i = 0; i < ITERS; i++)
- {
- int shift = (rand() % 7 + 1);
-
- int index = i % TEST_CASES;
- checked(opt, opt_dest, short_test_buff[index] + j, stride, shift, (int)stride);
- ref(ref_dest, short_test_buff[index] + j, stride, shift, (int)stride);
-
- if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int32_t)))
- return false;
-
- reportfail();
- j += INCR;
- }
-
- return true;
-}
-
-bool PixelHarness::check_cvt32to16_shl_t(cvt32to16_shl_t ref, cvt32to16_shl_t opt)
+bool PixelHarness::check_cpy2Dto1D_shl_t(cpy2Dto1D_shl_t ref, cpy2Dto1D_shl_t opt)
{
ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
@@ -412,8 +359,36 @@
int shift = (rand() % 7 + 1);
int index = i % TEST_CASES;
- checked(opt, opt_dest, int_test_buff[index] + j, stride, shift);
- ref(ref_dest, int_test_buff[index] + j, stride, shift);
+ checked(opt, opt_dest, short_test_buff[index] + j, stride, shift);
+ ref(ref_dest, short_test_buff[index] + j, stride, shift);
+
+ if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
+ return false;
+
+ reportfail();
+ j += INCR;
+ }
+
+ return true;
+}
+
+bool PixelHarness::check_cpy2Dto1D_shr_t(cpy2Dto1D_shr_t ref, cpy2Dto1D_shr_t opt)
+{
+ ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
+ ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
+
+ memset(ref_dest, 0xCD, sizeof(ref_dest));
+ memset(opt_dest, 0xCD, sizeof(opt_dest));
+
+ int j = 0;
+ intptr_t stride = STRIDE;
+ for (int i = 0; i < ITERS; i++)
+ {
+ int shift = (rand() % 7 + 1);
+
+ int index = i % TEST_CASES;
+ checked(opt, opt_dest, short_test_buff[index] + j, stride, shift);
+ ref(ref_dest, short_test_buff[index] + j, stride, shift);
if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
return false;
@@ -451,7 +426,7 @@
return true;
}
-bool PixelHarness::check_copy_shr_t(copy_shr_t ref, copy_shr_t opt)
+bool PixelHarness::check_cpy1Dto2D_shl_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt)
{
ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
@@ -466,8 +441,8 @@
int shift = (rand() % 7 + 1);
int index = i % TEST_CASES;
- checked(opt, opt_dest, short_test_buff[index] + j, stride, shift, (int)STRIDE);
- ref(ref_dest, short_test_buff[index] + j, stride, shift, (int)STRIDE);
+ checked(opt, opt_dest, short_test_buff[index] + j, stride, shift);
+ ref(ref_dest, short_test_buff[index] + j, stride, shift);
if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
return false;
@@ -479,7 +454,7 @@
return true;
}
-bool PixelHarness::check_copy_shl_t(copy_shl_t ref, copy_shl_t opt)
+bool PixelHarness::check_cpy1Dto2D_shr_t(cpy1Dto2D_shr_t ref, cpy1Dto2D_shr_t opt)
{
ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
@@ -1280,41 +1255,40 @@
}
}
- if ((i < BLOCK_64x64) && opt.cvt16to32_shr[i])
+ if ((i < BLOCK_64x64) && opt.cpy2Dto1D_shl[i])
{
- if (!check_cvt16to32_shr_t(ref.cvt16to32_shr[i], opt.cvt16to32_shr[i]))
+ if (!check_cpy2Dto1D_shl_t(ref.cpy2Dto1D_shl[i], opt.cpy2Dto1D_shl[i]))
{
- printf("cvt16to32_shr failed!\n");
+ printf("cpy2Dto1D_shl failed!\n");
return false;
}
}
- if ((i < BLOCK_64x64) && opt.cvt32to16_shl[i])
+ if ((i < BLOCK_64x64) && opt.cpy2Dto1D_shr[i])
{
- if (!check_cvt32to16_shl_t(ref.cvt32to16_shl[i], opt.cvt32to16_shl[i]))
+ if (!check_cpy2Dto1D_shr_t(ref.cpy2Dto1D_shr[i], opt.cpy2Dto1D_shr[i]))
{
- printf("cvt32to16_shl failed!\n");
+ printf("cpy2Dto1D_shr failed!\n");
return false;
}
}
- if ((i < BLOCK_64x64) && opt.copy_shl[i])
+ if ((i < BLOCK_64x64) && opt.cpy1Dto2D_shl[i])
{
- if (!check_copy_shl_t(ref.copy_shl[i], opt.copy_shl[i]))
+ if (!check_cpy1Dto2D_shl_t(ref.cpy1Dto2D_shl[i], opt.cpy1Dto2D_shl[i]))
{
- printf("copy_shl[%dx%d] failed!\n", 4 << i, 4 << i);
+ printf("cpy1Dto2D_shl[%dx%d] failed!\n", 4 << i, 4 << i);
return false;
}
}
- }
-
- if (opt.cpy16to16_shl)
- {
- if (!check_copy16to16_shl_t(ref.cpy16to16_shl, opt.cpy16to16_shl))
+ if ((i < BLOCK_64x64) && opt.cpy1Dto2D_shr[i])
{
- printf("copy16to16_shl failed!\n");
- return false;
+ if (!check_cpy1Dto2D_shr_t(ref.cpy1Dto2D_shr[i], opt.cpy1Dto2D_shr[i]))
+ {
+ printf("cpy1Dto2D_shr[%dx%d] failed!\n", 4 << i, 4 << i);
+ return false;
+ }
}
}
@@ -1408,15 +1382,6 @@
}
}
- if (opt.copy_shr)
- {
- if (!check_copy_shr_t(ref.copy_shr, opt.copy_shr))
- {
- printf("copy_shr failed!\n");
- return false;
- }
- }
-
return true;
}
@@ -1637,16 +1602,28 @@
REPORT_SPEEDUP(opt.var[i], ref.var[i], pbuf1, STRIDE);
}
- if ((i < BLOCK_64x64) && opt.cvt16to32_shr[i])
+ if ((i < BLOCK_64x64) && opt.cpy2Dto1D_shl[i])
{
- HEADER("cvt16to32_shr[%dx%d]", 4 << i, 4 << i);
- REPORT_SPEEDUP(opt.cvt16to32_shr[i], ref.cvt16to32_shr[i], ibuf1, sbuf2, STRIDE, 3, 4);
+ HEADER("cpy2Dto1D_shl[%dx%d]", 4 << i, 4 << i);
+ REPORT_SPEEDUP(opt.cpy2Dto1D_shl[i], ref.cpy2Dto1D_shl[i], sbuf1, sbuf2, STRIDE, MAX_TR_DYNAMIC_RANGE - X265_DEPTH - (i + 2));
}
- if ((i < BLOCK_64x64) && opt.cvt32to16_shl[i])
+ if ((i < BLOCK_64x64) && opt.cpy2Dto1D_shr[i])
{
- HEADER("cvt32to16_shl[%dx%d]", 4 << i, 4 << i);
- REPORT_SPEEDUP(opt.cvt32to16_shl[i], ref.cvt32to16_shl[i], sbuf2, ibuf1, STRIDE, 3);
+ HEADER("cpy2Dto1D_shr[%dx%d]", 4 << i, 4 << i);
+ REPORT_SPEEDUP(opt.cpy2Dto1D_shr[i], ref.cpy2Dto1D_shr[i], sbuf1, sbuf2, STRIDE, 3);
+ }
+
+ if ((i < BLOCK_64x64) && opt.cpy1Dto2D_shl[i])
+ {
+ HEADER("cpy1Dto2D_shl[%dx%d]", 4 << i, 4 << i);
+ REPORT_SPEEDUP(opt.cpy1Dto2D_shl[i], ref.cpy1Dto2D_shl[i], sbuf1, sbuf2, STRIDE, 64);
+ }
+
+ if ((i < BLOCK_64x64) && opt.cpy1Dto2D_shr[i])
+ {
+ HEADER("cpy1Dto2D_shr[%dx%d]", 4 << i, 4 << i);
+ REPORT_SPEEDUP(opt.cpy1Dto2D_shr[i], ref.cpy1Dto2D_shr[i], sbuf1, sbuf2, STRIDE, 64);
}
if ((i < BLOCK_64x64) && opt.copy_cnt[i])
@@ -1654,19 +1631,6 @@
HEADER("copy_cnt[%dx%d]", 4 << i, 4 << i);
REPORT_SPEEDUP(opt.copy_cnt[i], ref.copy_cnt[i], sbuf1, sbuf2, STRIDE);
}
-
- if ((i < BLOCK_64x64) && opt.copy_shl[i])
- {
- HEADER("copy_shl[%dx%d]", 4 << i, 4 << i);
- REPORT_SPEEDUP(opt.copy_shl[i], ref.copy_shl[i], sbuf1, sbuf2, STRIDE, 64);
- }
-
- }
-
- if (opt.cpy16to16_shl)
- {
- HEADER0("cpy16to16_shl");
- REPORT_SPEEDUP(opt.cpy16to16_shl, ref.cpy16to16_shl, sbuf2, sbuf1, 64, 5, 64);
}
if (opt.weight_pp)
@@ -1728,11 +1692,4 @@
HEADER0("planecopy_cp");
REPORT_SPEEDUP(opt.planecopy_cp, ref.planecopy_cp, uchar_test_buff[0], 64, pbuf1, 64, 64, 64, 2);
}
-
- if (opt.copy_shr)
- {
- HEADER0("copy_shr");
- REPORT_SPEEDUP(opt.copy_shr, ref.copy_shr, sbuf1, sbuf2, 64, 5, 64);
- }
-
}
diff -r dfe0803ae6be -r b4454aa1b6ab source/test/pixelharness.h
--- a/source/test/pixelharness.h Wed Nov 26 16:56:00 2014 -0600
+++ b/source/test/pixelharness.h Thu Nov 27 10:12:03 2014 +0900
@@ -80,12 +80,11 @@
bool check_weightp(weightp_pp_t ref, weightp_pp_t opt);
bool check_weightp(weightp_sp_t ref, weightp_sp_t opt);
bool check_downscale_t(downscale_t ref, downscale_t opt);
- bool check_copy16to16_shl_t(cpy16to16_shl_t ref, cpy16to16_shl_t opt);
- bool check_cvt16to32_shr_t(cvt16to32_shr_t ref, cvt16to32_shr_t opt);
- bool check_cvt32to16_shl_t(cvt32to16_shl_t ref, cvt32to16_shl_t opt);
+ bool check_cpy2Dto1D_shl_t(cpy2Dto1D_shl_t ref, cpy2Dto1D_shl_t opt);
+ bool check_cpy2Dto1D_shr_t(cpy2Dto1D_shr_t ref, cpy2Dto1D_shr_t opt);
+ bool check_cpy1Dto2D_shl_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt);
+ bool check_cpy1Dto2D_shr_t(cpy1Dto2D_shr_t ref, cpy1Dto2D_shr_t opt);
bool check_copy_cnt_t(copy_cnt_t ref, copy_cnt_t opt);
- bool check_copy_shr_t(copy_shr_t ref, copy_shr_t opt);
- bool check_copy_shl_t(copy_shl_t ref, copy_shl_t opt);
bool check_pixel_var(var_t ref, var_t opt);
bool check_ssim_4x4x2_core(ssim_4x4x2_core_t ref, ssim_4x4x2_core_t opt);
bool check_ssim_end(ssim_end4_t ref, ssim_end4_t opt);
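
For reference, the function-pointer types these checkers take are shaped as follows, inferred from the declarations earlier in this patch (the actual typedefs live in primitives.h, not shown in this excerpt):

    typedef void (*cpy2Dto1D_shl_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
    typedef void (*cpy2Dto1D_shr_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
    typedef void (*cpy1Dto2D_shl_t)(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
    typedef void (*cpy1Dto2D_shr_t)(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);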