[x265] [PATCH 1 of 2] Merged buffer extension with Horizontal filter; integrated with encoder
deepthidevaki at multicorewareinc.com
Fri Jun 28 15:17:10 CEST 2013
# HG changeset patch
# User Deepthi Devaki
# Date 1372417533 -19800
# Node ID 7f1f8f3b77069aea10a3d4c54e0a8a9554e7bf19
# Parent c79ed90edca573a569751842243331c588137836
Merged buffer extension with Horizontal filter; integrated with encoder
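Conceptually, the horizontal interpolation now pads its own output planes in the same pass instead of leaving that to a separate xExtendPicCompBorder call. A minimal sketch of the fused idea (filterPixel is a hypothetical stand-in for the 8-tap luma filter, not actual x265 code):

    typedef unsigned char pixel;
    pixel filterPixel(const pixel *src, int x);        // hypothetical interpolation helper

    static void filterRowAndExtend(const pixel *src, pixel *dst, int width, int marginX)
    {
        for (int x = 0; x < width; x++)
            dst[x] = filterPixel(src, x);              // horizontal interpolation
        for (int x = 1; x <= marginX; x++)
        {
            dst[-x] = dst[0];                          // replicate into left margin
            dst[width - 1 + x] = dst[width - 1];       // replicate into right margin
        }
    }

This avoids a second full pass over each of the three HPEL output planes.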
diff -r c79ed90edca5 -r 7f1f8f3b7706 source/common/ipfilter.cpp
--- a/source/common/ipfilter.cpp Fri Jun 28 02:09:19 2013 -0500
+++ b/source/common/ipfilter.cpp Fri Jun 28 16:35:33 2013 +0530
@@ -476,7 +476,7 @@
xExtendPicCompBorder(dstP, dstStride, block_width, block_height, marginX, marginY);
}
-void filterHorizontalMultiplane(int bitDepth, pixel *src, int srcStride, short *midF, short* midA, short* midB, short* midC, int midStride, pixel *pDstA, pixel *pDstB, pixel *pDstC, int pDstStride, int block_width, int block_height)
+void filterHorizontalMultiplaneExtend(int bitDepth, pixel *src, int srcStride, short *midF, short* midA, short* midB, short* midC, int midStride, pixel *pDstA, pixel *pDstB, pixel *pDstC, int pDstStride, int block_width, int block_height, int marginX, int marginY)
{
filterConvertPelToShort(bitDepth, src, srcStride, midF, midStride, block_width, block_height);
filterHorizontal_pel_short<8>(bitDepth, src, srcStride, midB, midStride, block_width, block_height, TComInterpolationFilter::m_lumaFilter[2]);
@@ -485,6 +485,10 @@
filterConvertShortToPel(bitDepth, midA, midStride, pDstA, pDstStride, block_width, block_height);
filterConvertShortToPel(bitDepth, midB, midStride, pDstB, pDstStride, block_width, block_height);
filterConvertShortToPel(bitDepth, midC, midStride, pDstC, pDstStride, block_width, block_height);
+
+ xExtendPicCompBorder(pDstA, pDstStride, block_width, block_height, marginX, marginY);
+ xExtendPicCompBorder(pDstB, pDstStride, block_width, block_height, marginX, marginY);
+ xExtendPicCompBorder(pDstC, pDstStride, block_width, block_height, marginX, marginY);
}
}
@@ -516,6 +520,6 @@
p.ipFilter_s_s[FILTER_V_S_S_4] = filterVertical_short_short<4>;
p.filterVmulti = filterVerticalMultiplaneExtend;
- p.filterHmulti = filterHorizontalMultiplane;
+ p.filterHmulti = filterHorizontalMultiplaneExtend;
}
}
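For reference, xExtendPicCompBorder pads a plane by replicating its edge pixels outward into the margins. A simplified standalone sketch of that behavior (assuming typedef uint8_t pixel; not the exact x265 implementation):

    #include <stdint.h>
    #include <string.h>

    typedef uint8_t pixel;

    void extendBorderSketch(pixel *img, int stride, int width, int height,
                            int marginX, int marginY)
    {
        for (int y = 0; y < height; y++)               // replicate left/right columns
        {
            pixel *row = img + y * stride;
            for (int x = 1; x <= marginX; x++)
            {
                row[-x] = row[0];
                row[width - 1 + x] = row[width - 1];
            }
        }
        pixel *top = img - marginX;                    // replicate top/bottom rows,
        pixel *bot = img + (height - 1) * stride - marginX; // margins included
        for (int y = 1; y <= marginY; y++)
        {
            memcpy(top - y * stride, top, (width + 2 * marginX) * sizeof(pixel));
            memcpy(bot + y * stride, bot, (width + 2 * marginX) * sizeof(pixel));
        }
    }

The vectorized versions further below do the same thing, but interleave the column replication with the filter loop and use 16-byte stores across the margins.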
diff -r c79ed90edca5 -r 7f1f8f3b7706 source/common/primitives.h
--- a/source/common/primitives.h Fri Jun 28 02:09:19 2013 -0500
+++ b/source/common/primitives.h Fri Jun 28 16:35:33 2013 +0530
@@ -210,7 +210,7 @@
typedef void (*calcrecon_t)(pixel* piPred, short* piResi, pixel* piReco, short* piRecQt, pixel *piRecIPred, int uiStride, int uiRecQtStride, int uiRecIPredStride);
typedef void (*transpose_t)(pixel* pDst, pixel* pSrc, intptr_t nStride);
typedef void (*filterVmulti_t)(int bitDepth, short *src, int srcStride, pixel *dstE, pixel *dstI, pixel *dstP, int dstStride, int block_width, int block_height, int marginX, int marginY);
-typedef void (*filterHmulti_t)(int bitDepth, pixel *src, int srcStride, short *midF, short* midA, short* midB, short* midC, int midStride, pixel *pDstA, pixel *pDstB, pixel *pDstC, int pDstStride, int block_width, int block_height);
+typedef void (*filterHmulti_t)(int bitDepth, pixel *src, int srcStride, short *midF, short* midA, short* midB, short* midC, int midStride, pixel *pDstA, pixel *pDstB, pixel *pDstC, int pDstStride, int block_width, int block_height, int marginX, int marginY);
typedef void (*dequant_t)(int bitDepth, const int* pSrc, int* pDes, int iWidth, int iHeight, int mcqp_miper, int mcqp_mirem, bool useScalingList, unsigned int uiLog2TrSize, int *piDequantCoef);
typedef uint32_t (*quantaq_t)(int *coef, int *quantCoeff, int *deltaU, int *qCoef, int *arlCCoef, int qBitsC, int qBits, int add, int numCoeff);
typedef uint32_t (*quant_t)(int *coef, int *quantCoeff, int *deltaU, int *qCoef, int qBits, int add, int numCoeff);
diff -r c79ed90edca5 -r 7f1f8f3b7706 source/common/reference.cpp
--- a/source/common/reference.cpp Fri Jun 28 02:09:19 2013 -0500
+++ b/source/common/reference.cpp Fri Jun 28 16:35:33 2013 +0530
@@ -107,7 +107,7 @@
primitives.filterHmulti(g_bitDepthY, srcPtr, m_lumaStride, // source buffer
intPtrF, intPtrA, intPtrB, intPtrC, m_intStride, // 4 intermediate HPEL buffers
m_lumaPlane[1][0] + bufOffset, m_lumaPlane[2][0] + bufOffset, m_lumaPlane[3][0] + bufOffset, m_lumaStride, // 3 (x=n, y=0) output buffers (no V interp)
- m_filterWidth + (2 * s_intMarginX), m_filterHeight + (2 * s_intMarginY));
+ m_filterWidth + (2 * s_intMarginX), m_filterHeight + (2 * s_intMarginY), m_reconPic->m_iLumaMarginX - s_tmpMarginX - s_intMarginX, m_reconPic->m_iLumaMarginY - s_tmpMarginY - s_intMarginY);
}
if (!m_pool)
@@ -164,10 +164,10 @@
pixel *dstPtr3 = m_lumaPlane[x][3] - s_tmpMarginY * m_lumaStride - s_tmpMarginX;
primitives.filterVmulti(g_bitDepthY, intPtr, m_intStride, dstPtr1, dstPtr2, dstPtr3, m_lumaStride, m_filterWidth, m_filterHeight, m_reconPic->m_iLumaMarginX - s_tmpMarginX, m_reconPic->m_iLumaMarginY - s_tmpMarginY);
- if (x > 0)
- {
- /* the Y=0 planes were not extended by the horizontal filter */
- int bufOffset = -(s_tmpMarginY + s_intMarginY) * m_lumaStride - (s_tmpMarginX + s_intMarginX);
- m_reconPic->xExtendPicCompBorder((Pel *)m_lumaPlane[x][0] + bufOffset, m_lumaStride, m_filterWidth + (2 * s_intMarginX), m_filterHeight + (2 * s_intMarginY), m_reconPic->m_iLumaMarginX - s_tmpMarginX - s_intMarginX, m_reconPic->m_iLumaMarginY - s_tmpMarginY - s_intMarginY);
- }
+ //if (x > 0)
+ //{
+ // /* the Y=0 planes were not extended by the horizontal filter */
+ // int bufOffset = -(s_tmpMarginY + s_intMarginY) * m_lumaStride - (s_tmpMarginX + s_intMarginX);
+ // m_reconPic->xExtendPicCompBorder((Pel *)m_lumaPlane[x][0] + bufOffset, m_lumaStride, m_filterWidth + (2 * s_intMarginX), m_filterHeight + (2 * s_intMarginY), m_reconPic->m_iLumaMarginX - s_tmpMarginX - s_intMarginX, m_reconPic->m_iLumaMarginY - s_tmpMarginY - s_intMarginY);
+ //}
}
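A note on the margin arguments in this call: the filtered block already spans the temporary and intermediate margins, so the extension only needs to fill the remaining outer portion of the picture margin. With hypothetical values (illustration only; the actual numbers depend on the build):

    int lumaMarginX = 64;                 // full picture margin (assumed)
    int tmpMarginX = 4, intMarginX = 4;   // margins already covered by the filtered block (assumed)
    int extendX = lumaMarginX - tmpMarginX - intMarginX;  // 56 columns left to replicate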
diff -r c79ed90edca5 -r 7f1f8f3b7706 source/common/vec/ipfilter.inc
--- a/source/common/vec/ipfilter.inc Fri Jun 28 02:09:19 2013 -0500
+++ b/source/common/vec/ipfilter.inc Fri Jun 28 16:35:33 2013 +0530
@@ -55,7 +55,7 @@
#if !HIGH_BIT_DEPTH
p.filterVmulti = filterVerticalMultiplaneExtend;
- p.filterHmulti = filterHorizontalMultiplane;
+ p.filterHmulti = filterHorizontalMultiplaneExtend;
#endif
}
}
diff -r c79ed90edca5 -r 7f1f8f3b7706 source/common/vec/ipfilter8.inc
--- a/source/common/vec/ipfilter8.inc Fri Jun 28 02:09:19 2013 -0500
+++ b/source/common/vec/ipfilter8.inc Fri Jun 28 16:35:33 2013 +0530
@@ -773,8 +773,8 @@
}
}
-void filterHorizontalMultiplane(int /*bitDepth*/, pixel *src, int srcStride, short *dstF, short* dstA, short* dstB, short* dstC, int dstStride, pixel *pDstA, pixel *pDstB, pixel *pDstC, int pDstStride, int block_width, int block_height)
-{
+#if INSTRSET < 5
+void filterHorizontalMultiplaneExtend(int /*bitDepth*/, pixel *src, int srcStride, short *dstF, short* dstA, short* dstB, short* dstC, int dstStride, pixel *pDstA, pixel *pDstB, pixel *pDstC, int pDstStride, int block_width, int block_height, int marginX, int marginY){
int row, col;
src -= (8 / 2 - 1);
@@ -789,27 +789,22 @@
Vec8s vec_offset(offset);
Vec8s sumaL, sumaH, sumbL, sumbH, sumcL, sumcH, tmp, exp1;
Vec8s valL, valH;
- // Load Ai, ai += Ai*coefi
+ Vec16uc tmp16a, tmp16b, tmp16c;
+ // Load Ai, ai += Ai*coefi
for (row = 0; row < block_height; row++)
{
col = 0;
- for (; col + 16 <= (block_width); col += 16) // Iterations multiple of 8
- {
vec_src0.load(src + col);
sumbL = -(extend_low(vec_src0));
- sumbH = -(extend_high(vec_src0));
// a = b+=4*a1, c+=1*a1
vec_src0.load(src + col + 1); // Load the 8 elements
sumcL = extend_low(vec_src0);
sumbL += (sumcL << 2);
sumaL = sumbL;
- sumcH = extend_high(vec_src0);
- sumbH += (sumcH << 2);
- sumaH = sumbH;
-
+
// a +=-10*a2 b+=-11*a2 c+=-5*a2
vec_src0.load(src + col + 2);
tmp = extend_low(vec_src0);
@@ -819,13 +814,6 @@
tmp <<= 1;
sumaL += tmp;
sumbL += tmp;
- tmp = extend_high(vec_src0);
- sumbH -= tmp;
- tmp *= (-5);
- sumcH += tmp;
- tmp <<= 1;
- sumaH += tmp;
- sumbH += tmp;
// a +=58*a3 b+=40*a3 c+=17*a3
vec_src0.load(src + col + 3);
@@ -837,15 +825,7 @@
tmp *= 40;
sumbL += tmp;
sumaL += (tmp + exp1);
- tmp = extend_high(vec_src0);
- ((tmp << 6) - IF_INTERNAL_OFFS).store(dstF + col + 8); // storing A as short into intermediate buffer
- exp1 = (tmp << 4) + tmp;
- sumcH += exp1;
- sumaH += tmp;
- tmp *= 40;
- sumbH += tmp;
- sumaH += (tmp + exp1);
-
+
// a +=17*a4 b+=40*a4 c+=58*a4
vec_src0.load(src + col + 4);
tmp = extend_low(vec_src0);
@@ -855,13 +835,108 @@
tmp *= 40;
sumbL += tmp;
sumcL += (tmp + exp1);
- tmp = extend_high(vec_src0);
+
+ // a +=-5*a5 b+=-11*a5 c+=-10*a5
+ vec_src0.load(src + col + 5);
+ tmp = extend_low(vec_src0);
+ sumbL -= tmp;
+ tmp *= (-5);
+ sumaL += tmp;
+ tmp <<= 1;
+ sumcL += tmp;
+ sumbL += tmp;
+
+ // a +=1*a6 b+=4*a6 c+=4*a6
+ vec_src0.load(src + col + 6);
+ tmp = extend_low(vec_src0);
+ sumaL += tmp;
+ tmp <<= 2;
+ sumbL += tmp;
+ sumcL += tmp;
+
+ // a +=0*a7 b+=-1*a7 c+=-1*a7
+ vec_src0.load(src + col + 7);
+ tmp = extend_low(vec_src0);
+ sumbL -= tmp;
+ sumcL -= tmp;
+ sumaL = (sumaL + vec_offset); // Add offset to sum_low
+ sumbL = (sumbL + vec_offset);
+ sumcL = (sumcL + vec_offset);
+
+ sumaL.store(dstA + col); // Store vector
+ sumaL = (sumaL + IF_INTERNAL_OFFS + 32) >> 6;
+ tmp16a = compress_unsafe(sumaL, sumaL); // It's certainly safe here, just a misnomer
+ tmp16a.store_partial(8,pDstA + row * pDstStride + col);
+
+ sumbL.store(dstB + col);
+ sumbL = (sumbL + IF_INTERNAL_OFFS + 32) >> 6;
+ tmp16b = compress_unsafe(sumbL, sumbL);
+ tmp16b.store_partial(8,pDstB + row * pDstStride + col);
+
+ sumcL.store(dstC + col);
+ sumcL = (sumcL + IF_INTERNAL_OFFS + 32) >> 6;
+ tmp16c = compress_unsafe(sumcL, sumcL);
+ tmp16c.store_partial(8,pDstC + row * pDstStride + col);
+
+ //Extend First column
+ __m128i ma, mb, mc;
+ ma = permute16uc<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>((Vec16uc)tmp16a);
+ mb = permute16uc<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>((Vec16uc)tmp16b);
+ mc = permute16uc<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>((Vec16uc)tmp16c);
+
+ for (int i = -marginX; i < -16; i += 16)
+ {
+ _mm_storeu_si128((__m128i*)(pDstA + row * pDstStride + i), ma);
+ _mm_storeu_si128((__m128i*)(pDstB + row * pDstStride + i), mb);
+ _mm_storeu_si128((__m128i*)(pDstC + row * pDstStride + i), mc);
+ }
+ _mm_storeu_si128((__m128i*)(pDstA + row * pDstStride - 16), ma); /*Assuming marginX > 16*/
+ _mm_storeu_si128((__m128i*)(pDstB + row * pDstStride - 16), mb);
+ _mm_storeu_si128((__m128i*)(pDstC + row * pDstStride - 16), mc);
+
+ col+=8;
+
+ for (; col + 8/*16*/ <= (block_width); col += 8/*16*/) // Iterations multiple of 8
+ {
+ vec_src0.load(src + col);
+ sumbL = -(extend_low(vec_src0));
+
+ // a = b+=4*a1, c+=1*a1
+ vec_src0.load(src + col + 1); // Load the 8 elements
+ sumcL = extend_low(vec_src0);
+ sumbL += (sumcL << 2);
+ sumaL = sumbL;
+
+ // a +=-10*a2 b+=-11*a2 c+=-5*a2
+ vec_src0.load(src + col + 2);
+ tmp = extend_low(vec_src0);
+ sumbL -= tmp;
+ tmp *= (-5);
+ sumcL += tmp;
+ tmp <<= 1;
+ sumaL += tmp;
+ sumbL += tmp;
+
+ // a +=58*a3 b+=40*a3 c+=17*a3
+ vec_src0.load(src + col + 3);
+ tmp = extend_low(vec_src0);
+ ((tmp << 6) - IF_INTERNAL_OFFS).store(dstF + col); // storing A as short into intermediate buffer
exp1 = (tmp << 4) + tmp;
- sumaH += exp1;
- sumcH += tmp;
+ sumcL += exp1;
+ sumaL += tmp;
tmp *= 40;
- sumbH += tmp;
- sumcH += (tmp + exp1);
+ sumbL += tmp;
+ sumaL += (tmp + exp1);
+
+ // a +=17*a4 b+=40*a4 c+=58*a4
+ vec_src0.load(src + col + 4);
+ tmp = extend_low(vec_src0);
+ exp1 = (tmp << 4) + tmp;
+ sumaL += exp1;
+ sumcL += tmp;
+ tmp *= 40;
+ sumbL += tmp;
+ sumcL += (tmp + exp1);
// a +=-5*a5 b+=-11*a5 c+=-10*a5
vec_src0.load(src + col + 5);
@@ -872,14 +947,7 @@
tmp <<= 1;
sumcL += tmp;
sumbL += tmp;
- tmp = extend_high(vec_src0);
- sumbH -= tmp;
- tmp *= (-5);
- sumaH += tmp;
- tmp <<= 1;
- sumcH += tmp;
- sumbH += tmp;
-
+
// a +=1*a6 b+=4*a6 c+=4*a6
vec_src0.load(src + col + 6);
tmp = extend_low(vec_src0);
@@ -887,11 +955,6 @@
tmp <<= 2;
sumbL += tmp;
sumcL += tmp;
- tmp = extend_high(vec_src0);
- sumaH += tmp;
- tmp <<= 2;
- sumbH += tmp;
- sumcH += tmp;
// a +=0*a7 b+=-1*a7 c+=-1*a7
vec_src0.load(src + col + 7);
@@ -901,43 +964,31 @@
sumaL = (sumaL + vec_offset); // Add offset to sum_low
sumbL = (sumbL + vec_offset);
sumcL = (sumcL + vec_offset);
- tmp = extend_high(vec_src0);
- sumbH -= tmp;
- sumcH -= tmp;
- sumaH = (sumaH + vec_offset);
- sumbH = (sumbH + vec_offset);
- sumcH = (sumcH + vec_offset);
sumaL.store(dstA + col); // Store vector
- sumaH.store(dstA + col + 8); // Store vector
- valL = (sumaL + IF_INTERNAL_OFFS + 32) >> 6;
- valH = (sumaH + IF_INTERNAL_OFFS + 32) >> 6;
- compress_unsafe(valL, valH).store(pDstA + row * pDstStride + col);
+ sumaL = (sumaL + IF_INTERNAL_OFFS + 32) >> 6;
+ tmp16a = compress_unsafe(sumaL, sumaL);
+ tmp16a.store_partial(8,pDstA + row * pDstStride + col);
sumbL.store(dstB + col);
- sumbH.store(dstB + col + 8);
- valL = (sumbL + IF_INTERNAL_OFFS + 32) >> 6;
- valH = (sumbH + IF_INTERNAL_OFFS + 32) >> 6;
- compress_unsafe(valL, valH).store(pDstB + row * pDstStride + col);
+ sumbL = (sumbL + IF_INTERNAL_OFFS + 32) >> 6;
+ tmp16b = compress_unsafe(sumbL, sumbL);
+ tmp16b.store_partial(8,pDstB + row * pDstStride + col);
sumcL.store(dstC + col);
- sumcH.store(dstC + col + 8);
- valL = (sumcL + IF_INTERNAL_OFFS + 32) >> 6;
- valH = (sumcH + IF_INTERNAL_OFFS + 32) >> 6;
- compress_unsafe(valL, valH).store(pDstC + row * pDstStride + col);
+ sumcL = (sumcL + IF_INTERNAL_OFFS + 32) >> 6;
+ tmp16c = compress_unsafe(sumcL, sumcL);
+ tmp16c.store_partial(8,pDstC + row * pDstStride + col);
+
}
if (block_width - col > 0)
{
- vec_src0.load(src + block_width - 13);
- if (block_width - col > 8)
- {
- tmp = extend_low(vec_src0);
- ((tmp << 6) - IF_INTERNAL_OFFS).store(dstF + block_width - 16);
- }
- tmp = extend_high(vec_src0);
+ vec_src0.load(src + block_width - 5);
+ tmp = extend_low(vec_src0);
((tmp << 6) - IF_INTERNAL_OFFS).store(dstF + block_width - 8);
+ short vala, valb, valc;
for (; col < block_width; col++) // Remaining iterations
{
vec_src0.load(src + col);
@@ -945,9 +996,9 @@
int isuma = horizontal_add(tmp * Vec8s(-1, 4, -10, 58, 17, -5, 1, 0));
int isumb = horizontal_add(tmp * Vec8s(-1, 4, -11, 40, 40, -11, 4, -1));
int isumc = horizontal_add(tmp * Vec8s(0, 1, -5, 17, 58, -10, 4, -1));
- short vala = (short)(isuma + offset) >> shift;
- short valb = (short)(isumb + offset) >> shift;
- short valc = (short)(isumc + offset) >> shift;
+ vala = (short)(isuma + offset) >> shift;
+ valb = (short)(isumb + offset) >> shift;
+ valc = (short)(isumc + offset) >> shift;
dstA[col] = vala;
vala = (vala + IF_INTERNAL_OFFS + 32) >> 6;
if (vala < 0) vala = 0;
@@ -967,14 +1018,359 @@
valc = 255;
pDstC[row * pDstStride + col] = (pixel)valc;
}
+ tmp16a = Vec16uc(vala);
+ tmp16b = Vec16uc(valb);
+ tmp16c = Vec16uc(valc);
}
+ else
+ {
+ tmp16a = permute16uc<15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15>((Vec16uc)tmp16a);
+ tmp16b = permute16uc<15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15>((Vec16uc)tmp16b);
+ tmp16c = permute16uc<15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15>((Vec16uc)tmp16c);
+ }
+ //Extend last column
+ for (int i = -marginX; i < -16; i += 16)
+ {
+ _mm_storeu_si128((__m128i*)(pDstA + row * pDstStride + block_width + marginX + i), tmp16a);
+ _mm_storeu_si128((__m128i*)(pDstB + row * pDstStride + block_width + marginX + i), tmp16b);
+ _mm_storeu_si128((__m128i*)(pDstC + row * pDstStride + block_width + marginX + i), tmp16c);
+ }
+ _mm_storeu_si128((__m128i*)(pDstA + row * pDstStride + block_width + marginX - 16), tmp16a); /*Assuming marginX > 16*/
+ _mm_storeu_si128((__m128i*)(pDstB + row * pDstStride + block_width + marginX - 16), tmp16b);
+ _mm_storeu_si128((__m128i*)(pDstC + row * pDstStride + block_width + marginX - 16), tmp16c);
+
src += srcStride;
dstF += dstStride;
dstA += dstStride;
dstB += dstStride;
dstC += dstStride;
}
+
+ // Extending bottom rows
+ pixel *pe, *pi, *pp;
+ pe = pDstA + (block_height - 1) * pDstStride - marginX;
+ pi = pDstB + (block_height - 1) * pDstStride - marginX;
+ pp = pDstC + (block_height - 1) * pDstStride - marginX;
+ for (int y = 1; y <= marginY; y++)
+ memcpy(pe + y * pDstStride, pe, block_width + marginX * 2);
+ for (int y = 1; y <= marginY; y++)
+ memcpy(pi + y * pDstStride, pi, block_width + marginX * 2);
+ for (int y = 1; y <= marginY; y++)
+ memcpy(pp + y * pDstStride, pp, block_width + marginX * 2);
+
+ // Extending top rows
+ pe -= ((block_height - 1) * dstStride);
+ pi -= ((block_height - 1) * dstStride);
+ pp -= ((block_height - 1) * dstStride);
+ for (int y = 1; y <= marginY; y++)
+ memcpy(pe - y * pDstStride, pe, block_width + marginX * 2);
+ for (int y = 1; y <= marginY; y++)
+ memcpy(pi - y * pDstStride, pi, block_width + marginX * 2);
+ for (int y = 1; y <= marginY; y++)
+ memcpy(pp - y * pDstStride, pp, block_width + marginX * 2);
}
+#else
+void filterHorizontalMultiplaneExtend(int /*bitDepth*/, pixel *src, int srcStride, short *dstF, short* dstA, short* dstB, short* dstC, int dstStride, pixel *pDstA, pixel *pDstB, pixel *pDstC, int pDstStride, int block_width, int block_height, int marginX, int marginY)
+{
+ int row, col;
+
+ src -= (8 / 2 - 1);
+ int offset;
+ int headRoom = IF_INTERNAL_PREC - 8;
+ int shift = IF_FILTER_PREC;
+ shift -= headRoom;
+ offset = -IF_INTERNAL_OFFS << shift;
+
+ __m128i vec_src0;
+ __m128i vec_offset = _mm_set1_epi16(offset);
+ __m128i sumaL, sumbL, sumcL, tmp, exp1;
+ __m128i tmp16a, tmp16b, tmp16c;
+
+ // Load Ai, ai += Ai*coefi
+ for (row = 0; row < block_height; row++)
+ {
+ col = 0;
+
+ vec_src0 = _mm_loadu_si128((__m128i const*)(src + col));
+ sumbL = (_mm_unpacklo_epi8(vec_src0,_mm_setzero_si128()));
+ sumbL = _mm_sub_epi16(_mm_setzero_si128(), sumbL);
+
+ // a = b+=4*a1, c+=1*a1
+ vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 1));
+ sumcL = _mm_unpacklo_epi8(vec_src0,_mm_setzero_si128());
+ sumbL = _mm_add_epi16(sumbL, _mm_sll_epi16(sumcL,_mm_cvtsi32_si128(2)));
+ sumaL = sumbL;
+
+ // a +=-10*a2 b+=-11*a2 c+=-5*a2
+ vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 2));
+ tmp = _mm_unpacklo_epi8(vec_src0,_mm_setzero_si128());
+ sumbL = _mm_sub_epi16(sumbL, tmp);
+ tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(-5));
+ sumcL = _mm_add_epi16(sumcL, tmp);
+ tmp = _mm_sll_epi16(tmp,_mm_cvtsi32_si128(1));
+ sumaL = _mm_add_epi16(sumaL, tmp);
+ sumbL = _mm_add_epi16(sumbL, tmp);
+
+
+ // a +=58*a3 b+=40*a3 c+=17*a3
+ vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 3));
+ tmp = _mm_unpacklo_epi8(vec_src0,_mm_setzero_si128());
+ _mm_storeu_si128((__m128i*)(dstF+col),_mm_sub_epi16(_mm_sll_epi16(tmp,_mm_cvtsi32_si128(6)), _mm_set1_epi16(IF_INTERNAL_OFFS)));
+ exp1 = _mm_add_epi16(tmp, _mm_sll_epi16(tmp,_mm_cvtsi32_si128(4)));
+ sumcL = _mm_add_epi16(sumcL, exp1);
+ sumaL = _mm_add_epi16(sumaL, tmp);
+ tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(40));
+ sumbL = _mm_add_epi16(sumbL, tmp);
+ sumaL = _mm_add_epi16(sumaL, _mm_add_epi16(exp1, tmp));
+
+ // a +=17*a4 b+=40*a4 c+=58*a4
+ vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 4));
+ tmp = _mm_unpacklo_epi8(vec_src0,_mm_setzero_si128());
+ exp1 = _mm_add_epi16(tmp, _mm_sll_epi16(tmp,_mm_cvtsi32_si128(4)));
+ sumaL = _mm_add_epi16(sumaL, exp1);
+ sumcL = _mm_add_epi16(sumcL, tmp);
+ tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(40));
+ sumbL = _mm_add_epi16(sumbL, tmp);
+ sumcL = _mm_add_epi16(sumcL, _mm_add_epi16(exp1, tmp));
+
+ // a +=-5*a5 b+=-11*a5 c+=-10*a5
+ vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 5));
+ tmp = _mm_unpacklo_epi8(vec_src0,_mm_setzero_si128());
+ sumbL = _mm_sub_epi16(sumbL, tmp);
+ tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(-5));
+ sumaL = _mm_add_epi16(sumaL, tmp);
+ tmp =_mm_sll_epi16(tmp,_mm_cvtsi32_si128(1));
+ sumcL = _mm_add_epi16(sumcL, tmp);
+ sumbL = _mm_add_epi16(sumbL, tmp);
+
+ // a +=1*a6 b+=4*a6 c+=4*a6
+ vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 6));
+ tmp = _mm_unpacklo_epi8(vec_src0,_mm_setzero_si128());
+ sumaL = _mm_add_epi16(sumaL, tmp);
+ tmp = _mm_sll_epi16(tmp,_mm_cvtsi32_si128(2));
+ sumbL = _mm_add_epi16(sumbL, tmp);
+ sumcL = _mm_add_epi16(sumcL, tmp);
+
+ // a +=0*a7 b+=-1*a7 c+=-1*a7
+ vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 7));
+ tmp = _mm_unpacklo_epi8(vec_src0,_mm_setzero_si128());
+ sumbL = _mm_sub_epi16(sumbL, tmp);
+ sumcL = _mm_sub_epi16(sumcL, tmp);
+ sumaL = _mm_add_epi16(sumaL, vec_offset);
+ sumbL = _mm_add_epi16(sumbL, vec_offset);
+ sumcL = _mm_add_epi16(sumcL, vec_offset);
+
+ _mm_storeu_si128((__m128i*)(dstA+col),sumaL);
+ sumaL = _mm_add_epi16(sumaL, _mm_set1_epi16(IF_INTERNAL_OFFS + 32));
+ sumaL = _mm_sra_epi16(sumaL,_mm_cvtsi32_si128(6));
+ tmp16a = _mm_packus_epi16(sumaL,sumaL);
+ _mm_storel_epi64((__m128i*)(pDstA + row * pDstStride + col),tmp16a);
+
+ _mm_storeu_si128((__m128i*)(dstB+col),sumbL);
+ sumbL = _mm_add_epi16(sumbL, _mm_set1_epi16(IF_INTERNAL_OFFS + 32));
+ sumbL = _mm_sra_epi16(sumbL,_mm_cvtsi32_si128(6));
+ tmp16b = _mm_packus_epi16(sumbL,sumbL);
+ _mm_storel_epi64((__m128i*)(pDstB + row * pDstStride + col),tmp16b);
+
+ _mm_storeu_si128((__m128i*)(dstC+col),sumcL);
+ sumcL = _mm_add_epi16(sumcL, _mm_set1_epi16(IF_INTERNAL_OFFS + 32));
+ sumcL = _mm_sra_epi16(sumcL,_mm_cvtsi32_si128(6));
+ tmp16c = _mm_packus_epi16(sumcL,sumcL);
+ _mm_storel_epi64((__m128i*)(pDstC + row * pDstStride + col),tmp16c);
+
+ //Extend First column
+ __m128i ma, mb, mc;
+ ma = _mm_shuffle_epi8(tmp16a , _mm_set1_epi8(0));
+ mb = _mm_shuffle_epi8(tmp16b , _mm_set1_epi8(0));
+ mc = _mm_shuffle_epi8(tmp16c , _mm_set1_epi8(0));
+
+ for (int i = -marginX; i < -16; i += 16)
+ {
+ _mm_storeu_si128((__m128i*)(pDstA + row * pDstStride + i), ma);
+ _mm_storeu_si128((__m128i*)(pDstB + row * pDstStride + i), mb);
+ _mm_storeu_si128((__m128i*)(pDstC + row * pDstStride + i), mc);
+ }
+ _mm_storeu_si128((__m128i*)(pDstA + row * pDstStride - 16), ma); /*Assuming marginX > 16*/
+ _mm_storeu_si128((__m128i*)(pDstB + row * pDstStride - 16), mb);
+ _mm_storeu_si128((__m128i*)(pDstC + row * pDstStride - 16), mc);
+
+ col+=8;
+
+ for (; col + 8/*16*/ <= (block_width); col += 8/*16*/) // Iterations multiple of 8
+ {
+ vec_src0 = _mm_loadu_si128((__m128i const*)(src + col));
+ sumbL = (_mm_unpacklo_epi8(vec_src0,_mm_setzero_si128()));
+ sumbL = _mm_sub_epi16(_mm_setzero_si128(), sumbL);
+
+ // a = b+=4*a1, c+=1*a1
+ vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 1));
+ sumcL = _mm_unpacklo_epi8(vec_src0,_mm_setzero_si128());
+ sumbL = _mm_add_epi16(sumbL, _mm_sll_epi16(sumcL,_mm_cvtsi32_si128(2)));
+ sumaL = sumbL;
+
+ // a +=-10*a2 b+=-11*a2 c+=-5*a2
+ vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 2));
+ tmp = _mm_unpacklo_epi8(vec_src0,_mm_setzero_si128());
+ sumbL = _mm_sub_epi16(sumbL, tmp);
+ tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(-5));
+ sumcL = _mm_add_epi16(sumcL, tmp);
+ tmp = _mm_sll_epi16(tmp,_mm_cvtsi32_si128(1));
+ sumaL = _mm_add_epi16(sumaL, tmp);
+ sumbL = _mm_add_epi16(sumbL, tmp);
+
+
+ // a +=58*a3 b+=40*a3 c+=17*a3
+ vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 3));
+ tmp = _mm_unpacklo_epi8(vec_src0,_mm_setzero_si128());
+ _mm_storeu_si128((__m128i*)(dstF+col),_mm_sub_epi16(_mm_sll_epi16(tmp,_mm_cvtsi32_si128(6)), _mm_set1_epi16(IF_INTERNAL_OFFS)));
+ exp1 = _mm_add_epi16(tmp, _mm_sll_epi16(tmp,_mm_cvtsi32_si128(4)));
+ sumcL = _mm_add_epi16(sumcL, exp1);
+ sumaL = _mm_add_epi16(sumaL, tmp);
+ tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(40));
+ sumbL = _mm_add_epi16(sumbL, tmp);
+ sumaL = _mm_add_epi16(sumaL, _mm_add_epi16(exp1, tmp));
+
+ // a +=17*a4 b+=40*a4 c+=58*a4
+ vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 4));
+ tmp = _mm_unpacklo_epi8(vec_src0,_mm_setzero_si128());
+ exp1 = _mm_add_epi16(tmp, _mm_sll_epi16(tmp,_mm_cvtsi32_si128(4)));
+ sumaL = _mm_add_epi16(sumaL, exp1);
+ sumcL = _mm_add_epi16(sumcL, tmp);
+ tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(40));
+ sumbL = _mm_add_epi16(sumbL, tmp);
+ sumcL = _mm_add_epi16(sumcL, _mm_add_epi16(exp1, tmp));
+
+ // a +=-5*a5 b+=-11*a5 c+=-10*a5
+ vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 5));
+ tmp = _mm_unpacklo_epi8(vec_src0,_mm_setzero_si128());
+ sumbL = _mm_sub_epi16(sumbL, tmp);
+ tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(-5));
+ sumaL = _mm_add_epi16(sumaL, tmp);
+ tmp =_mm_sll_epi16(tmp,_mm_cvtsi32_si128(1));
+ sumcL = _mm_add_epi16(sumcL, tmp);
+ sumbL = _mm_add_epi16(sumbL, tmp);
+
+ // a +=1*a6 b+=4*a6 c+=4*a6
+ vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 6));
+ tmp = _mm_unpacklo_epi8(vec_src0,_mm_setzero_si128());
+ sumaL = _mm_add_epi16(sumaL, tmp);
+ tmp = _mm_sll_epi16(tmp,_mm_cvtsi32_si128(2));
+ sumbL = _mm_add_epi16(sumbL, tmp);
+ sumcL = _mm_add_epi16(sumcL, tmp);
+
+ // a +=0*a7 b+=-1*a7 c+=-1*a7
+ vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 7));
+ tmp = _mm_unpacklo_epi8(vec_src0,_mm_setzero_si128());
+ sumbL = _mm_sub_epi16(sumbL, tmp);
+ sumcL = _mm_sub_epi16(sumcL, tmp);
+ sumaL = _mm_add_epi16(sumaL, vec_offset);
+ sumbL = _mm_add_epi16(sumbL, vec_offset);
+ sumcL = _mm_add_epi16(sumcL, vec_offset);
+
+ _mm_storeu_si128((__m128i*)(dstA+col),sumaL);
+ sumaL = _mm_add_epi16(sumaL, _mm_set1_epi16(IF_INTERNAL_OFFS + 32));
+ sumaL = _mm_sra_epi16(sumaL,_mm_cvtsi32_si128(6));
+ tmp16a = _mm_packus_epi16(sumaL,sumaL);
+ _mm_storel_epi64((__m128i*)(pDstA + row * pDstStride + col),tmp16a);
+
+ _mm_storeu_si128((__m128i*)(dstB+col),sumbL);
+ sumbL = _mm_add_epi16(sumbL, _mm_set1_epi16(IF_INTERNAL_OFFS + 32));
+ sumbL = _mm_sra_epi16(sumbL,_mm_cvtsi32_si128(6));
+ tmp16b = _mm_packus_epi16(sumbL,sumbL);
+ _mm_storel_epi64((__m128i*)(pDstB + row * pDstStride + col),tmp16b);
+
+ _mm_storeu_si128((__m128i*)(dstC+col),sumcL);
+ sumcL = _mm_add_epi16(sumcL, _mm_set1_epi16(IF_INTERNAL_OFFS + 32));
+ sumcL = _mm_sra_epi16(sumcL,_mm_cvtsi32_si128(6));
+ tmp16c = _mm_packus_epi16(sumcL,sumcL);
+ _mm_storel_epi64((__m128i*)(pDstC + row * pDstStride + col),tmp16c);
+
+ }
+
+ if (block_width - col > 0)
+ {
+ vec_src0 = _mm_loadu_si128((__m128i const*)(src + block_width - 5));
+ tmp = _mm_unpacklo_epi8(vec_src0,_mm_setzero_si128());
+ _mm_storeu_si128((__m128i*)(dstF + block_width - 8),_mm_sub_epi16(_mm_sll_epi16(tmp,_mm_cvtsi32_si128(6)), _mm_set1_epi16(IF_INTERNAL_OFFS)));
+ __m128i a, b, c, sum1, sum2, sum3=_mm_setzero_si128();
+ for (; col < block_width; col++) // Remaining iterations
+ {
+ vec_src0 = _mm_loadu_si128((__m128i const*)(src + col));
+ tmp = _mm_unpacklo_epi8(vec_src0,_mm_setzero_si128()); // Assuming that there is no overflow (Everywhere in this function!)
+ a = _mm_setr_epi16(-1, 4, -10, 58, 17, -5, 1, 0);
+ a = _mm_mullo_epi16(tmp, a);
+ b = _mm_setr_epi16(-1, 4, -11, 40, 40, -11, 4, -1);
+ b = _mm_mullo_epi16(tmp, b);
+ c = _mm_setr_epi16(0, 1, -5, 17, 58, -10, 4, -1);
+ c = _mm_mullo_epi16(tmp, c);
+ sum1 = _mm_hadd_epi16(a,b); // horizontally add 8 elements in 3 steps
+ sum2 = _mm_hadd_epi16(c,c);
+ sum2 = _mm_hadd_epi16(sum1,sum2);
+ sum3 = _mm_hadd_epi16(sum2,sum2);
+ sum3 = _mm_add_epi16(sum3, vec_offset);
+ sum3 = _mm_sra_epi16(sum3,_mm_cvtsi32_si128(shift));
+ dstA[col] = _mm_cvtsi128_si32(sum3);
+ dstB[col] = _mm_extract_epi16(sum3, 1);
+ dstC[col] = _mm_extract_epi16(sum3, 2);
+ sum3 = _mm_add_epi16(sum3, _mm_set1_epi16(IF_INTERNAL_OFFS + 32));
+ sum3 = _mm_sra_epi16(sum3,_mm_cvtsi32_si128(6));
+ sum3 = _mm_packus_epi16(sum3, sum3);
+ pDstA[row * pDstStride + col] = _mm_extract_epi8(sum3, 0);
+ pDstB[row * pDstStride + col] = _mm_extract_epi8(sum3, 1);
+ pDstC[row * pDstStride + col] = _mm_extract_epi8(sum3, 2);
+ }
+ tmp16a = _mm_shuffle_epi8(sum3 , _mm_set1_epi8(0));
+ tmp16b = _mm_shuffle_epi8(sum3 , _mm_set1_epi8(1));
+ tmp16c = _mm_shuffle_epi8(sum3 , _mm_set1_epi8(2));
+ }
+ else
+ {
+ tmp16a = _mm_shuffle_epi8(tmp16a , _mm_set1_epi8(15));
+ tmp16b = _mm_shuffle_epi8(tmp16b , _mm_set1_epi8(15));
+ tmp16c = _mm_shuffle_epi8(tmp16c , _mm_set1_epi8(15));
+ }
+ //Extend last column
+ for (int i = -marginX; i < -16; i += 16)
+ {
+ _mm_storeu_si128((__m128i*)(pDstA + row * pDstStride + block_width + marginX + i), tmp16a);
+ _mm_storeu_si128((__m128i*)(pDstB + row * pDstStride + block_width + marginX + i), tmp16b);
+ _mm_storeu_si128((__m128i*)(pDstC + row * pDstStride + block_width + marginX + i), tmp16c);
+ }
+ _mm_storeu_si128((__m128i*)(pDstA + row * pDstStride + block_width + marginX - 16), tmp16a); /*Assuming marginX > 16*/
+ _mm_storeu_si128((__m128i*)(pDstB + row * pDstStride + block_width + marginX - 16), tmp16b);
+ _mm_storeu_si128((__m128i*)(pDstC + row * pDstStride + block_width + marginX - 16), tmp16c);
+
+ src += srcStride;
+ dstF += dstStride;
+ dstA += dstStride;
+ dstB += dstStride;
+ dstC += dstStride;
+ }
+
+ // Extending bottom rows
+ pixel *pe, *pi, *pp;
+ pe = pDstA + (block_height - 1) * pDstStride - marginX;
+ pi = pDstB + (block_height - 1) * pDstStride - marginX;
+ pp = pDstC + (block_height - 1) * pDstStride - marginX;
+ for (int y = 1; y <= marginY; y++)
+ memcpy(pe + y * pDstStride, pe, block_width + marginX * 2);
+ for (int y = 1; y <= marginY; y++)
+ memcpy(pi + y * pDstStride, pi, block_width + marginX * 2);
+ for (int y = 1; y <= marginY; y++)
+ memcpy(pp + y * pDstStride, pp, block_width + marginX * 2);
+
+ // Extending top rows
+ pe -= ((block_height - 1) * dstStride);
+ pi -= ((block_height - 1) * dstStride);
+ pp -= ((block_height - 1) * dstStride);
+ for (int y = 1; y <= marginY; y++)
+ memcpy(pe - y * pDstStride, pe, block_width + marginX * 2);
+ for (int y = 1; y <= marginY; y++)
+ memcpy(pi - y * pDstStride, pi, block_width + marginX * 2);
+ for (int y = 1; y <= marginY; y++)
+ memcpy(pp - y * pDstStride, pp, block_width + marginX * 2);
+}
+#endif
template<int N>
void filterHorizontal_pel_short(int bitDepth, pixel *src, int srcStride, short *dst, int dstStride, int block_width, int block_height, short const *coeff)
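Both column-extension paths above reduce to broadcasting one packed byte across a 16-byte register and streaming it over the margin. A standalone sketch of that trick (SSSE3 pshufb; not the exact patch code):

    #include <tmmintrin.h>   // _mm_shuffle_epi8

    // An all-zero shuffle mask replicates byte 0 of v into every lane;
    // _mm_set1_epi8(15) would replicate the last byte instead.
    static inline __m128i broadcast_byte(__m128i v, char idx)
    {
        return _mm_shuffle_epi8(v, _mm_set1_epi8(idx));
    }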
diff -r c79ed90edca5 -r 7f1f8f3b7706 source/test/ipfilterharness.cpp
--- a/source/test/ipfilterharness.cpp Fri Jun 28 02:09:19 2013 -0500
+++ b/source/test/ipfilterharness.cpp Fri Jun 28 16:35:33 2013 +0530
@@ -313,9 +313,10 @@
bool IPFilterHarness::check_filterHMultiplane(x265::filterHmulti_t ref, x265::filterHmulti_t opt)
{
- short rand_height = 32 + 9; // Can be randomly generated Height
- short rand_width = 32 + 15; // Can be randomly generated Width
- short rand_srcStride, rand_dstStride;
+ short rand_height;
+ short rand_width;
+ int rand_srcStride, rand_dstStride;
+ int marginX, marginY;
short dstAvec[100 * 100];
short dstEvec[100 * 100];
@@ -325,12 +326,12 @@
short dstEref[100 * 100];
short dstIref[100 * 100];
short dstPref[100 * 100];
- pixel pDstAvec[100 * 100];
- pixel pDstAref[100 * 100];
- pixel pDstBvec[100 * 100];
- pixel pDstBref[100 * 100];
- pixel pDstCvec[100 * 100];
- pixel pDstCref[100 * 100];
+ pixel pDstAvec[200 * 200];
+ pixel pDstAref[200 * 200];
+ pixel pDstBvec[200 * 200];
+ pixel pDstBref[200 * 200];
+ pixel pDstCvec[200 * 200];
+ pixel pDstCref[200 * 200];
memset(dstAref, 0, 10000 * sizeof(short));
memset(dstEref, 0, 10000 * sizeof(short));
@@ -340,34 +341,37 @@
memset(dstEvec, 0, 10000 * sizeof(short));
memset(dstIvec, 0, 10000 * sizeof(short));
memset(dstPvec, 0, 10000 * sizeof(short));
- memset(pDstAvec, 0, 10000 * sizeof(pixel));
- memset(pDstAref, 0, 10000 * sizeof(pixel));
- memset(pDstBvec, 0, 10000 * sizeof(pixel));
- memset(pDstBref, 0, 10000 * sizeof(pixel));
- memset(pDstCvec, 0, 10000 * sizeof(pixel));
- memset(pDstCref, 0, 10000 * sizeof(pixel));
+ memset(pDstAvec, 0, 40000 * sizeof(pixel));
+ memset(pDstAref, 0, 40000 * sizeof(pixel));
+ memset(pDstBvec, 0, 40000 * sizeof(pixel));
+ memset(pDstBref, 0, 40000 * sizeof(pixel));
+ memset(pDstCvec, 0, 40000 * sizeof(pixel));
+ memset(pDstCref, 0, 40000 * sizeof(pixel));
for (int i = 0; i <= 100; i++)
{
- rand_srcStride = 64; // Can be randomly generated
- rand_dstStride = 64;
+ rand_height = (rand() % 32) + 1;
+ rand_width = (rand() % 32) + 8;
+ marginX = (rand()%16)+16;
+ marginY = (rand()%16)+16;
+ rand_srcStride = rand_width; // Can be randomly generated
+ rand_dstStride = rand_width+2*marginX;
opt(8, pixel_buff + 3 * rand_srcStride,
rand_srcStride,
dstAvec, dstEvec, dstIvec, dstPvec,
- rand_dstStride, pDstAvec, pDstBvec, pDstCvec, rand_dstStride,
- rand_width,
- rand_height);
+ rand_dstStride, pDstAvec+marginY*rand_dstStride+marginX, pDstBvec+marginY*rand_dstStride+marginX, pDstCvec+marginY*rand_dstStride+marginX,
+ rand_dstStride, rand_width, rand_height, marginX,marginY);
ref(8, pixel_buff + 3 * rand_srcStride,
rand_srcStride,
dstAref, dstEref, dstIref, dstPref,
- rand_dstStride, pDstAref, pDstBref, pDstCref, rand_dstStride,
+ rand_dstStride, pDstAref+marginY*rand_dstStride+marginX, pDstBref+marginY*rand_dstStride+marginX, pDstCref+marginY*rand_dstStride+marginX, rand_dstStride,
rand_width,
- rand_height);
+ rand_height, marginX,marginY);
if (memcmp(dstAvec, dstAref, 100 * 100 * sizeof(short)) || memcmp(dstEvec, dstEref, 100 * 100 * sizeof(short))
|| memcmp(dstIvec, dstIref, 100 * 100 * sizeof(short)) || memcmp(dstPvec, dstPref, 100 * 100 * sizeof(short))
- || memcmp(pDstAvec, pDstAref, 100 * 100 * sizeof(pixel)) || memcmp(pDstBvec, pDstBref, 100 * 100 * sizeof(pixel))
- || memcmp(pDstCvec, pDstCref, 100 * 100 * sizeof(pixel))
+ || memcmp(pDstAvec, pDstAref, 200 * 200 * sizeof(pixel)) || memcmp(pDstBvec, pDstBref, 200 * 200 * sizeof(pixel))
+ || memcmp(pDstCvec, pDstCref, 200 * 200 * sizeof(pixel))
)
{
return false;
@@ -517,6 +521,6 @@
{
printf("Filter-H-multiplane\t");
REPORT_SPEEDUP(opt.filterHmulti, ref.filterHmulti,
- 8, pixel_buff + 8 * srcStride, srcStride, IPF_vec_output_s, IPF_C_output_s, IPF_vec_output_s, IPF_C_output_s, dstStride, IPF_vec_output_p, IPF_C_output_p, IPF_vec_output_p, dstStride, width, height);
+ 8, pixel_buff + 8 * srcStride, srcStride, IPF_vec_output_s, IPF_C_output_s, IPF_vec_output_s, IPF_C_output_s, dstStride, IPF_vec_output_p+ 64 * 200 + 64, IPF_C_output_p+ 64 * 200 + 64, IPF_vec_output_p+ 64 * 200 + 64, dstStride, width, height,64,64);
}
}