[x265] [PATCH] intrapred: Improve Intra 64x64 [CHANGES OUTPUTS]
Min Chen
chenm003 at 163.com
Sun Jul 21 06:20:57 CEST 2013
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1374377151 -28800
# Node ID 18447bd07244696ae179f0ed6e23653a9ac8ad54
# Parent 158239bec6c14e44a605937f5ad6df56b0ceacec
intrapred: Improve Intra 64x64 [CHANGES OUTPUTS]
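
For a 64x64 CU, the 35-mode search is approximated by box-filtering the origin block down to 32x32 (scale2D_64to32), halving the 128 reference samples on each side (scale1D_128to64), predicting every mode at 32x32, and taking 4x the 32x32 SA8D as the 64x64 cost estimate. A minimal sketch of that cost approximation follows; it assumes an 8-bit pixel type and a standalone sa8d_32x32() helper standing in for primitives.sa8d[3], and is only an illustration, not code from this patch.

#include <stdint.h>

typedef uint8_t pixel;   /* 8-bit build */

/* assumed helper standing in for primitives.sa8d[3] */
int sa8d_32x32(const pixel *org, intptr_t strideOrg,
               const pixel *cur, intptr_t strideCur);

/* Approximate a 64x64 intra cost: 2x2 box-filter the source down to 32x32
 * (same rounding as the scale2D_64to32 C reference below), measure the
 * 32x32 SA8D against a 32x32 prediction, and scale the result by 4. */
static int approxIntraCost64(const pixel *fenc, intptr_t stride, const pixel *pred32)
{
    pixel scaled[32 * 32];

    for (int y = 0; y < 64; y += 2)
    {
        for (int x = 0; x < 64; x += 2)
        {
            int sum = fenc[(y + 0) * stride + (x + 0)] + fenc[(y + 0) * stride + (x + 1)]
                    + fenc[(y + 1) * stride + (x + 0)] + fenc[(y + 1) * stride + (x + 1)];
            scaled[(y >> 1) * 32 + (x >> 1)] = (pixel)((sum + 2) >> 2);
        }
    }

    return 4 * sa8d_32x32(scaled, 32, pred32, 32);
}
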
diff -r 158239bec6c1 -r 18447bd07244 source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp Fri Jul 19 13:40:22 2013 -0500
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp Sun Jul 21 11:25:51 2013 +0800
@@ -1968,36 +1968,35 @@
candNum = 0;
UInt modeCosts[35];
- Bool bFilter = (width <= 16);
Pel *pAbove0 = refAbove + width - 1;
Pel *pAbove1 = refAboveFlt + width - 1;
Pel *pLeft0 = refLeft + width - 1;
Pel *pLeft1 = refLeftFlt + width - 1;
- Pel *above = pAbove0;
- Pel *left = pLeft0;
-
- // 1
- primitives.intra_pred_dc(pAbove0 + 1, pLeft0 + 1, pred, stride, width, bFilter);
- modeCosts[DC_IDX] = sa8d(fenc, stride, pred, stride);
-
- // 0
- if (width >= 8 && width <= 32)
- {
- above = pAbove1;
- left = pLeft1;
- }
- primitives.intra_pred_planar((pixel*)above + 1, (pixel*)left + 1, pred, stride, width);
- modeCosts[PLANAR_IDX] = sa8d(fenc, stride, pred, stride);
// 33 Angle modes once
+ ALIGN_VAR_32(Pel, buf_trans[32 * 32]);
+ ALIGN_VAR_32(Pel, tmp[33 * 32 * 32]);
+
if (width <= 32)
{
- ALIGN_VAR_32(Pel, buf1[MAX_CU_SIZE * MAX_CU_SIZE]);
- ALIGN_VAR_32(Pel, tmp[33 * MAX_CU_SIZE * MAX_CU_SIZE]);
+ // 1
+ primitives.intra_pred_dc(pAbove0 + 1, pLeft0 + 1, pred, stride, width, (width <= 16));
+ modeCosts[DC_IDX] = sa8d(fenc, stride, pred, stride);
+
+ // 0
+ Pel *above = pAbove0;
+ Pel *left = pLeft0;
+ if (width >= 8 && width <= 32)
+ {
+ above = pAbove1;
+ left = pLeft1;
+ }
+ primitives.intra_pred_planar((pixel*)above + 1, (pixel*)left + 1, pred, stride, width);
+ modeCosts[PLANAR_IDX] = sa8d(fenc, stride, pred, stride);
// Transpose NxN
- x265::primitives.transpose[log2SizeMinus2](buf1, fenc, stride);
+ x265::primitives.transpose[log2SizeMinus2](buf_trans, (pixel*)fenc, stride);
x265::primitives.intra_pred_allangs[log2SizeMinus2](tmp, pAbove0, pLeft0, pAbove1, pLeft1, (width <= 16));
@@ -2005,18 +2004,59 @@
for (UInt mode = 2; mode < numModesAvailable; mode++)
{
bool modeHor = (mode < 18);
- Pel *cmp = (modeHor ? buf1 : fenc);
+ Pel *cmp = (modeHor ? buf_trans : fenc);
intptr_t srcStride = (modeHor ? width : stride);
modeCosts[mode] = sa8d(cmp, srcStride, &tmp[(mode - 2) * (width * width)], width);
}
}
else
{
+ // the origin block is 64x64; scale it down to 32x32
+ // TODO: make this path an option
+#if 1
+ ALIGN_VAR_32(Pel, buf_scale[32 * 32]);
+ x265::primitives.scale2D_64to32(buf_scale, fenc, stride);
+ x265::primitives.transpose[3](buf_trans, buf_scale, 32);
+
+ Pel above[2 * 32 + 1];
+ Pel left[2 * 32 + 1];
+
+ above[0] = left[0] = pAbove0[0];
+ x265::primitives.scale1D_128to64(above + 1, pAbove0 + 1, 0);
+ x265::primitives.scale1D_128to64(left + 1, pLeft0 + 1, 0);
+
+ // 1
+ primitives.intra_pred_dc(above + 1, left + 1, tmp, 32, 32, false);
+ modeCosts[DC_IDX] = 4 * x265::primitives.sa8d[3](buf_scale, 32, tmp, 32);
+
+ // 0
+ primitives.intra_pred_planar((pixel*)above + 1, (pixel*)left + 1, tmp, 32, 32);
+ modeCosts[PLANAR_IDX] = 4 * x265::primitives.sa8d[3](buf_scale, 32, tmp, 32);
+
+ x265::primitives.intra_pred_allangs[3](tmp, above, left, above, left, false);
+
+ // TODO: approximate the real 64x64 cost with 4x SA8D(32x32)
+ for (UInt mode = 2; mode < numModesAvailable; mode++)
+ {
+ bool modeHor = (mode < 18);
+ Pel *cmp_buf = (modeHor ? buf_trans : buf_scale);
+ modeCosts[mode] = 4 * x265::primitives.sa8d[3]((pixel*)cmp_buf, 32, (pixel*)&tmp[(mode - 2) * (32 * 32)], 32);
+ }
+#else
+ // 1
+ primitives.intra_pred_dc(pAbove0 + 1, pLeft0 + 1, pred, stride, width, false);
+ modeCosts[DC_IDX] = sa8d(fenc, stride, pred, stride);
+
+ // 0
+ primitives.intra_pred_planar((pixel*)pAbove0 + 1, (pixel*)pLeft0 + 1, pred, stride, width);
+ modeCosts[PLANAR_IDX] = sa8d(fenc, stride, pred, stride);
+
for (UInt mode = 2; mode < numModesAvailable; mode++)
{
predIntraLumaAng(mode, pred, stride, width);
modeCosts[mode] = sa8d(fenc, stride, pred, stride);
}
+#endif
}
// Find N least cost modes. N = numModesForFullRD
diff -r 158239bec6c1 -r 18447bd07244 source/common/pixel.cpp
--- a/source/common/pixel.cpp Fri Jul 19 13:40:22 2013 -0500
+++ b/source/common/pixel.cpp Sun Jul 21 11:25:51 2013 +0800
@@ -565,6 +565,39 @@
}
}
+void scale1D_128to64(pixel *dst, pixel *src, intptr_t /*stride*/)
+{
+ int x;
+
+ for (x = 0; x < 128; x += 2)
+ {
+ pixel pix0 = src[(x + 0)];
+ pixel pix1 = src[(x + 1)];
+ int sum = pix0 + pix1;
+
+ dst[x >> 1] = (pixel)((sum + 1) >> 1);
+ }
+}
+
+void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride)
+{
+ int x, y;
+
+ for (y = 0; y < 64; y += 2)
+ {
+ for (x = 0; x < 64; x += 2)
+ {
+ pixel pix0 = src[(y + 0) * stride + (x + 0)];
+ pixel pix1 = src[(y + 0) * stride + (x + 1)];
+ pixel pix2 = src[(y + 1) * stride + (x + 0)];
+ pixel pix3 = src[(y + 1) * stride + (x + 1)];
+ int sum = pix0 + pix1 + pix2 + pix3;
+
+ dst[y / 2 * 32 + x / 2] = (pixel)((sum + 2) >> 2);
+ }
+ }
+}
+
} // end anonymous namespace
namespace x265 {
@@ -770,5 +803,8 @@
p.pixelsub_sp = pixelsub_sp_c;
p.pixeladd_pp = pixeladd_pp_c;
p.pixeladd_ss = pixeladd_ss_c;
+
+ p.scale1D_128to64 = scale1D_128to64;
+ p.scale2D_64to32 = scale2D_64to32;
}
}
diff -r 158239bec6c1 -r 18447bd07244 source/common/primitives.h
--- a/source/common/primitives.h Fri Jul 19 13:40:22 2013 -0500
+++ b/source/common/primitives.h Sun Jul 21 11:25:51 2013 +0800
@@ -226,6 +226,7 @@
pixel *pDstA, pixel *pDstB, pixel *pDstC, int pDstStride, int block_width, int block_height,
int marginX, int marginY, int w, int roundw, int shiftw, int offsetw);
typedef void (*weightpUni_t)(short *src, pixel *dst, int srcStride, int dstStride, int width, int height, int w0, int round, int shift, int offset);
+typedef void (*scale_t)(pixel *dst, pixel *src, intptr_t stride);
/* Define a structure containing function pointers to optimized encoder
* primitives. Each pointer can reference either an assembly routine,
@@ -282,6 +283,9 @@
filterVwghtd_t filterVwghtd;
filterHwghtd_t filterHwghtd;
+
+ scale_t scale1D_128to64;
+ scale_t scale2D_64to32;
};
/* This copy of the table is what gets used by the encoder.
diff -r 158239bec6c1 -r 18447bd07244 source/common/vec/pixel.inc
--- a/source/common/vec/pixel.inc Fri Jul 19 13:40:22 2013 -0500
+++ b/source/common/vec/pixel.inc Sun Jul 21 11:25:51 2013 +0800
@@ -391,5 +391,10 @@
p.transpose[4] = transpose<64>;
p.weightpUni = weightUnidir;
#endif
+
+#if !HIGH_BIT_DEPTH && (INSTRSET >= 4)
+ p.scale1D_128to64 = scale1D_128to64;
+ p.scale2D_64to32 = scale2D_64to32;
+#endif
}
}
diff -r 158239bec6c1 -r 18447bd07244 source/common/vec/pixel8.inc
--- a/source/common/vec/pixel8.inc Fri Jul 19 13:40:22 2013 -0500
+++ b/source/common/vec/pixel8.inc Sun Jul 21 11:25:51 2013 +0800
@@ -2371,5 +2371,75 @@
res[2] = horizontal_add(sum3);
res[3] = horizontal_add(sum4);
}
+#endif /* if INSTRSET >= 8 */
-#endif /* if INSTRSET >= 8 */
+#if INSTRSET >= 4
+void scale1D_128to64(pixel *dst, pixel *src, intptr_t /*stride*/)
+{
+ const __m128i mask = _mm_setr_epi32(0x06040200, 0x0E0C0A08, 0x07050301, 0x0F0D0B09);
+
+ __m128i T00 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)&src[0 * 16]), mask);
+ __m128i T01 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)&src[1 * 16]), mask);
+ __m128i T02 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)&src[2 * 16]), mask);
+ __m128i T03 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)&src[3 * 16]), mask);
+ __m128i T04 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)&src[4 * 16]), mask);
+ __m128i T05 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)&src[5 * 16]), mask);
+ __m128i T06 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)&src[6 * 16]), mask);
+ __m128i T07 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)&src[7 * 16]), mask);
+
+ __m128i T10 = _mm_unpacklo_epi64(T00, T01);
+ __m128i T11 = _mm_unpackhi_epi64(T00, T01);
+ __m128i T12 = _mm_unpacklo_epi64(T02, T03);
+ __m128i T13 = _mm_unpackhi_epi64(T02, T03);
+ __m128i T14 = _mm_unpacklo_epi64(T04, T05);
+ __m128i T15 = _mm_unpackhi_epi64(T04, T05);
+ __m128i T16 = _mm_unpacklo_epi64(T06, T07);
+ __m128i T17 = _mm_unpackhi_epi64(T06, T07);
+
+ __m128i T20 = _mm_avg_epu8(T10, T11);
+ __m128i T21 = _mm_avg_epu8(T12, T13);
+ __m128i T22 = _mm_avg_epu8(T14, T15);
+ __m128i T23 = _mm_avg_epu8(T16, T17);
+
+ _mm_storeu_si128((__m128i*)&dst[ 0], T20);
+ _mm_storeu_si128((__m128i*)&dst[16], T21);
+ _mm_storeu_si128((__m128i*)&dst[32], T22);
+ _mm_storeu_si128((__m128i*)&dst[48], T23);
+}
+
+void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride)
+{
+ int i;
+ const __m128i c8_1 = _mm_set1_epi32(0x01010101);
+ const __m128i c16_2 = _mm_set1_epi32(0x00020002);
+
+ for (i = 0; i < 64; i += 2)
+ {
+ __m128i T00 = _mm_loadu_si128((__m128i*)&src[(i + 0) * stride + 0]);
+ __m128i T01 = _mm_loadu_si128((__m128i*)&src[(i + 0) * stride + 16]);
+ __m128i T02 = _mm_loadu_si128((__m128i*)&src[(i + 0) * stride + 32]);
+ __m128i T03 = _mm_loadu_si128((__m128i*)&src[(i + 0) * stride + 48]);
+ __m128i T10 = _mm_loadu_si128((__m128i*)&src[(i + 1) * stride + 0]);
+ __m128i T11 = _mm_loadu_si128((__m128i*)&src[(i + 1) * stride + 16]);
+ __m128i T12 = _mm_loadu_si128((__m128i*)&src[(i + 1) * stride + 32]);
+ __m128i T13 = _mm_loadu_si128((__m128i*)&src[(i + 1) * stride + 48]);
+
+ __m128i S00 = _mm_maddubs_epi16(T00, c8_1);
+ __m128i S01 = _mm_maddubs_epi16(T01, c8_1);
+ __m128i S02 = _mm_maddubs_epi16(T02, c8_1);
+ __m128i S03 = _mm_maddubs_epi16(T03, c8_1);
+ __m128i S10 = _mm_maddubs_epi16(T10, c8_1);
+ __m128i S11 = _mm_maddubs_epi16(T11, c8_1);
+ __m128i S12 = _mm_maddubs_epi16(T12, c8_1);
+ __m128i S13 = _mm_maddubs_epi16(T13, c8_1);
+
+ __m128i S20 = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(S00, S10), c16_2), 2);
+ __m128i S21 = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(S01, S11), c16_2), 2);
+ __m128i S22 = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(S02, S12), c16_2), 2);
+ __m128i S23 = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(S03, S13), c16_2), 2);
+
+ _mm_storeu_si128((__m128i*)&dst[(i >> 1) * 32 + 0], _mm_packus_epi16(S20, S21));
+ _mm_storeu_si128((__m128i*)&dst[(i >> 1) * 32 + 16], _mm_packus_epi16(S22, S23));
+ }
+}
+#endif /* if INSTRSET >= 4 */
diff -r 158239bec6c1 -r 18447bd07244 source/encoder/compress.cpp
--- a/source/encoder/compress.cpp Fri Jul 19 13:40:22 2013 -0500
+++ b/source/encoder/compress.cpp Sun Jul 21 11:25:51 2013 +0800
@@ -122,36 +122,36 @@
CandNum = 0;
UInt modeCosts[35];
- Bool bFilter = (width <= 16);
+ // CHM: TODO - this code was not copied verbatim from TEncSearch::estIntraPredQT; I synced it by hand, please check its logic
Pel *pAbove0 = m_search->refAbove + width - 1;
Pel *pAbove1 = m_search->refAboveFlt + width - 1;
Pel *pLeft0 = m_search->refLeft + width - 1;
Pel *pLeft1 = m_search->refLeftFlt + width - 1;
- Pel *pAbove = pAbove0;
- Pel *pLeft = pLeft0;
-
- // 1
- primitives.intra_pred_dc((pixel*)pAbove0 + 1, (pixel*)pLeft0 + 1, pred, stride, width, bFilter);
- modeCosts[DC_IDX] = sa8d(fenc, stride, pred, stride);
-
- // 0
- if (width >= 8 && width <= 32)
- {
- pAbove = pAbove1;
- pLeft = pLeft1;
- }
- primitives.intra_pred_planar(pAbove + 1, pLeft + 1, pred, stride, width);
- modeCosts[PLANAR_IDX] = sa8d(fenc, stride, pred, stride);
// 33 Angle modes once
- if (width <= 16)
+ ALIGN_VAR_32(Pel, buf_trans[32 * 32]);
+ ALIGN_VAR_32(Pel, tmp[33 * 32 * 32]);
+
+ if (width <= 32)
{
- ALIGN_VAR_32(Pel, buf1[MAX_CU_SIZE * MAX_CU_SIZE]);
- ALIGN_VAR_32(Pel, tmp[33 * MAX_CU_SIZE * MAX_CU_SIZE]);
+ // 1
+ primitives.intra_pred_dc(pAbove0 + 1, pLeft0 + 1, pred, stride, width, (width <= 16));
+ modeCosts[DC_IDX] = sa8d(fenc, stride, pred, stride);
+
+ // 0
+ Pel *above = pAbove0;
+ Pel *left = pLeft0;
+ if (width >= 8 && width <= 32)
+ {
+ above = pAbove1;
+ left = pLeft1;
+ }
+ primitives.intra_pred_planar((pixel*)above + 1, (pixel*)left + 1, pred, stride, width);
+ modeCosts[PLANAR_IDX] = sa8d(fenc, stride, pred, stride);
// Transpose NxN
- x265::primitives.transpose[nLog2SizeMinus2](buf1, fenc, stride);
+ x265::primitives.transpose[nLog2SizeMinus2](buf_trans, (pixel*)fenc, stride);
x265::primitives.intra_pred_allangs[nLog2SizeMinus2](tmp, pAbove0, pLeft0, pAbove1, pLeft1, (width <= 16));
@@ -159,17 +159,40 @@
for (UInt mode = 2; mode < numModesAvailable; mode++)
{
bool modeHor = (mode < 18);
- Pel *src2 = (modeHor ? buf1 : fenc);
+ Pel *cmp = (modeHor ? buf_trans : fenc);
intptr_t srcStride = (modeHor ? width : stride);
- modeCosts[mode] = sa8d(src2, srcStride, &tmp[(mode - 2) * (width * width)], width);
+ modeCosts[mode] = sa8d(cmp, srcStride, &tmp[(mode - 2) * (width * width)], width);
}
}
else
{
+ ALIGN_VAR_32(Pel, buf_scale[32 * 32]);
+ x265::primitives.scale2D_64to32(buf_scale, fenc, stride);
+ x265::primitives.transpose[3](buf_trans, buf_scale, 32);
+
+ Pel above[2 * 32 + 1];
+ Pel left[2 * 32 + 1];
+
+ above[0] = left[0] = pAbove0[0];
+ x265::primitives.scale1D_128to64(above + 1, pAbove0 + 1, 0);
+ x265::primitives.scale1D_128to64(left + 1, pLeft0 + 1, 0);
+
+ // 1
+ primitives.intra_pred_dc(above + 1, left + 1, tmp, 32, 32, false);
+ modeCosts[DC_IDX] = 4 * x265::primitives.sa8d[3](buf_scale, 32, tmp, 32);
+
+ // 0
+ primitives.intra_pred_planar((pixel*)above + 1, (pixel*)left + 1, tmp, 32, 32);
+ modeCosts[PLANAR_IDX] = 4 * x265::primitives.sa8d[3](buf_scale, 32, tmp, 32);
+
+ x265::primitives.intra_pred_allangs[3](tmp, above, left, above, left, false);
+
+ // TODO: approximate the real 64x64 cost with 4x SA8D(32x32)
for (UInt mode = 2; mode < numModesAvailable; mode++)
{
- m_search->predIntraLumaAng(mode, pred, stride, width);
- modeCosts[mode] = sa8d(fenc, stride, pred, stride);
+ bool modeHor = (mode < 18);
+ Pel *cmp_buf = (modeHor ? buf_trans : buf_scale);
+ modeCosts[mode] = 4 * x265::primitives.sa8d[3]((pixel*)cmp_buf, 32, (pixel*)&tmp[(mode - 2) * (32 * 32)], 32);
}
}
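
For reference, a tiny usage sketch of the two new scale primitives (buffer contents are arbitrary and the direct calls assume standalone copies of the C reference functions above; inside the encoder they are reached through p.scale1D_128to64 and p.scale2D_64to32):

#include <stdint.h>
#include <stdio.h>

typedef uint8_t pixel;

/* prototypes matching the C references added in pixel.cpp */
void scale1D_128to64(pixel *dst, pixel *src, intptr_t stride);
void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride);

int main(void)
{
    static pixel refs[128], refsHalf[64];
    static pixel blk64[64 * 64], blk32[32 * 32];

    for (int i = 0; i < 128; i++)
        refs[i] = (pixel)i;
    for (int i = 0; i < 64 * 64; i++)
        blk64[i] = (pixel)(i & 255);

    /* average adjacent pairs: dst[i] = (src[2i] + src[2i+1] + 1) >> 1 */
    scale1D_128to64(refsHalf, refs, 0);

    /* 2x2 box filter with rounding; stride is given in pixels */
    scale2D_64to32(blk32, blk64, 64);

    printf("refsHalf[0]=%d blk32[0]=%d\n", refsHalf[0], blk32[0]); /* 1 and 33 */
    return 0;
}
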