[x265] [PATCH] Improvement Intra 64x64
chen
chenm003 at 163.com
Sat Jul 20 13:53:20 CEST 2013
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1374320919 -28800
# Node ID b03337eab5e676a9be218b157b0d840302f80574
# Parent 158239bec6c14e44a605937f5ad6df56b0ceacec
intrapred: Improvement Intra 64x64
diff -r 158239bec6c1 -r b03337eab5e6 source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp Fri Jul 19 13:40:22 2013 -0500
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp Sat Jul 20 19:48:39 2013 +0800
@@ -1991,13 +1991,13 @@
modeCosts[PLANAR_IDX] = sa8d(fenc, stride, pred, stride);
// 33 Angle modes once
+ ALIGN_VAR_32(Pel, buf_trans[32 * 32]);
+ ALIGN_VAR_32(Pel, tmp[33 * 32 * 32]);
+
if (width <= 32)
{
- ALIGN_VAR_32(Pel, buf1[MAX_CU_SIZE * MAX_CU_SIZE]);
- ALIGN_VAR_32(Pel, tmp[33 * MAX_CU_SIZE * MAX_CU_SIZE]);
-
// Transpose NxN
- x265::primitives.transpose[log2SizeMinus2](buf1, fenc, stride);
+ x265::primitives.transpose[log2SizeMinus2](buf_trans, (pixel*)fenc, stride);
x265::primitives.intra_pred_allangs[log2SizeMinus2](tmp, pAbove0, pLeft0, pAbove1, pLeft1, (width <= 16));
@@ -2005,18 +2005,46 @@
for (UInt mode = 2; mode < numModesAvailable; mode++)
{
bool modeHor = (mode < 18);
- Pel *cmp = (modeHor ? buf1 : fenc);
+ Pel *cmp = (modeHor ? buf_trans : fenc);
intptr_t srcStride = (modeHor ? width : stride);
modeCosts[mode] = sa8d(cmp, srcStride, &tmp[(mode - 2) * (width * width)], width);
}
}
else
{
+ // width is 64
+ // TODO: make this a configurable option
+#if 1
+ ALIGN_VAR_32(Pel, buf_scale[32 * 32]);
+ x265::primitives.scale64to32(buf_scale, fenc, stride);
+ x265::primitives.transpose[3](buf_trans, buf_scale, 32);
+
+ Pel above[2 * 32 + 1];
+ Pel left[2 * 32 + 1];
+
+ above[0] = left[0] = pAbove0[0];
+ for (int i = 0; i < 32 * 2; i++)
+ {
+ above[1 + i] = (pAbove0[1 + i * 2] + pAbove0[1 + i * 2 + 1] + 1) >> 1;
+ left[1 + i] = (pLeft0[1 + i * 2] + pLeft0[1 + i * 2 + 1] + 1) >> 1;
+ }
+
+ x265::primitives.intra_pred_allangs[3]((pixel*)tmp, (pixel*)above, (pixel*)left, (pixel*)above, (pixel*)left, false);
+
+ // TODO: the 64x64 cost is approximated as 4x the 32x32 SA8D of the downscaled block; is it a mistake that DC and Planar above still use the full 64x64 SA8D?
+ for (UInt mode = 2; mode < numModesAvailable; mode++)
+ {
+ bool modeHor = (mode < 18);
+ Pel *cmp = (modeHor ? buf_trans : buf_scale);
+ modeCosts[mode] = 4 * x265::primitives.sa8d[3]((pixel*)cmp, 32, (pixel*)&tmp[(mode - 2) * (32 * 32)], 32);
+ }
+#else
for (UInt mode = 2; mode < numModesAvailable; mode++)
{
predIntraLumaAng(mode, pred, stride, width);
modeCosts[mode] = sa8d(fenc, stride, pred, stride);
}
+#endif
}
// Find N least cost modes. N = numModesForFullRD
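For clarity, here is a small standalone illustration of the reference-sample downsampling used in the new branch above (hypothetical buffers, 8-bit samples assumed): the 2*64+1 = 129 full-resolution neighbours are reduced to the 2*32+1 = 65 neighbours that a 32x32 prediction needs; the corner sample is copied as-is and every other sample is the rounded average of two adjacent full-resolution samples.

#include <cstdio>
#include <cstdint>

int main()
{
    uint8_t full[2 * 64 + 1];               // e.g. the "above" row of a 64x64 block
    for (int i = 0; i < 2 * 64 + 1; i++)
        full[i] = (uint8_t)i;               // simple ramp, easy to verify by eye

    uint8_t half[2 * 32 + 1];
    half[0] = full[0];                      // corner sample is kept as-is
    for (int i = 0; i < 2 * 32; i++)
        half[1 + i] = (uint8_t)((full[1 + i * 2] + full[1 + i * 2 + 1] + 1) >> 1);

    // With the ramp input, half[1 + i] comes out as 2*i + 2, i.e. the rounded
    // average of full[2*i + 1] and full[2*i + 2].
    for (int i = 0; i < 5; i++)
        printf("half[%d] = %d\n", 1 + i, half[1 + i]);
    return 0;
}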
diff -r 158239bec6c1 -r b03337eab5e6 source/common/pixel.cpp
--- a/source/common/pixel.cpp Fri Jul 19 13:40:22 2013 -0500
+++ b/source/common/pixel.cpp Sat Jul 20 19:48:39 2013 +0800
@@ -565,6 +565,25 @@
}
}
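+// 2x2 box filter with round-to-nearest averaging: downscale a 64x64 source
+// block (read at 'stride') into a packed 32x32 destination.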
+void scale64to32(pixel *dst, pixel *src, intptr_t stride)
+{
+ int x, y;
+
+ for (y = 0; y < 64; y += 2)
+ {
+ for (x = 0; x < 64; x += 2)
+ {
+ pixel pix0 = src[(y + 0) * stride + (x + 0)];
+ pixel pix1 = src[(y + 0) * stride + (x + 1)];
+ pixel pix2 = src[(y + 1) * stride + (x + 0)];
+ pixel pix3 = src[(y + 1) * stride + (x + 1)];
+ int sum = pix0 + pix1 + pix2 + pix3;
+
+ dst[y / 2 * 32 + x / 2] = (pixel)((sum + 2) >> 2);
+ }
+ }
+}
+
} // end anonymous namespace
namespace x265 {
@@ -770,5 +789,7 @@
p.pixelsub_sp = pixelsub_sp_c;
p.pixeladd_pp = pixeladd_pp_c;
p.pixeladd_ss = pixeladd_ss_c;
+
+ p.scale64to32 = scale64to32;
}
}
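A hypothetical caller of the new primitive (buffer name illustrative, 8-bit build assumed): the source pointer and stride refer to the full-resolution 64x64 block, and the destination is a packed 32x32 buffer holding the round-to-nearest 2x2 averages.

    ALIGN_VAR_32(pixel, halfRes[32 * 32]);                        // packed 32x32 output
    x265::primitives.scale64to32(halfRes, (pixel*)fenc, stride);  // fenc/stride: 64x64 source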
diff -r 158239bec6c1 -r b03337eab5e6 source/common/primitives.h
--- a/source/common/primitives.h Fri Jul 19 13:40:22 2013 -0500
+++ b/source/common/primitives.h Sat Jul 20 19:48:39 2013 +0800
@@ -226,6 +226,7 @@
pixel *pDstA, pixel *pDstB, pixel *pDstC, int pDstStride, int block_width, int block_height,
int marginX, int marginY, int w, int roundw, int shiftw, int offsetw);
typedef void (*weightpUni_t)(short *src, pixel *dst, int srcStride, int dstStride, int width, int height, int w0, int round, int shift, int offset);
+typedef void (*scale_t)(pixel *dst, pixel *src, intptr_t stride);
/* Define a structure containing function pointers to optimized encoder
* primitives. Each pointer can reference either an assembly routine,
@@ -282,6 +283,8 @@
filterVwghtd_t filterVwghtd;
filterHwghtd_t filterHwghtd;
+
+ scale_t scale64to32;
};
/* This copy of the table is what gets used by the encoder.
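The new scale_t pointer follows the same pattern as the other primitives: pixel.cpp installs the portable C routine and the vectorized setup in pixel.inc overrides it when built for SSSE3 or better. A rough sketch of that override order (the setup-function names here are hypothetical, not the actual x265 names):

void setupCPrimitives(x265::EncoderPrimitives &p)
{
    p.scale64to32 = scale64to32;    // portable C version from pixel.cpp
}

void setupVectorPrimitives(x265::EncoderPrimitives &p)
{
#if !HIGH_BIT_DEPTH && (INSTRSET >= 4)
    p.scale64to32 = scale64to32;    // SSSE3 version from pixel8.inc wins when available
#endif
}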
diff -r 158239bec6c1 -r b03337eab5e6 source/common/vec/pixel.inc
--- a/source/common/vec/pixel.inc Fri Jul 19 13:40:22 2013 -0500
+++ b/source/common/vec/pixel.inc Sat Jul 20 19:48:39 2013 +0800
@@ -391,5 +391,9 @@
p.transpose[4] = transpose<64>;
p.weightpUni = weightUnidir;
#endif
+
+#if !HIGH_BIT_DEPTH && (INSTRSET >= 4)
+ p.scale64to32 = scale64to32;
+#endif
}
}
diff -r 158239bec6c1 -r b03337eab5e6 source/common/vec/pixel8.inc
--- a/source/common/vec/pixel8.inc Fri Jul 19 13:40:22 2013 -0500
+++ b/source/common/vec/pixel8.inc Sat Jul 20 19:48:39 2013 +0800
@@ -2371,5 +2371,42 @@
res[2] = horizontal_add(sum3);
res[3] = horizontal_add(sum4);
}
-#endif /* if INSTRSET >= 8 */
+#endif /* if INSTRSET >= 8 */
+#if INSTRSET >= 4
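+// SSSE3 2x2 box-filter downscale: _mm_maddubs_epi16 with all byte weights set
+// to 1 produces horizontal pair sums per row; the pair sums of two adjacent
+// rows are added, biased by 2 for rounding, shifted right by 2 and packed back
+// to 8-bit pixels.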
+void scale64to32(pixel *dst, pixel *src, intptr_t stride)
+{
+ int i;
+ const __m128i c8_1 = _mm_set1_epi32(0x01010101);
+ const __m128i c16_2 = _mm_set1_epi32(0x00020002);
+
+ for (i = 0; i < 64; i += 2)
+ {
+ __m128i T00 = _mm_loadu_si128((__m128i*)&src[(i + 0) * stride + 0]);
+ __m128i T01 = _mm_loadu_si128((__m128i*)&src[(i + 0) * stride + 16]);
+ __m128i T02 = _mm_loadu_si128((__m128i*)&src[(i + 0) * stride + 32]);
+ __m128i T03 = _mm_loadu_si128((__m128i*)&src[(i + 0) * stride + 48]);
+ __m128i T10 = _mm_loadu_si128((__m128i*)&src[(i + 1) * stride + 0]);
+ __m128i T11 = _mm_loadu_si128((__m128i*)&src[(i + 1) * stride + 16]);
+ __m128i T12 = _mm_loadu_si128((__m128i*)&src[(i + 1) * stride + 32]);
+ __m128i T13 = _mm_loadu_si128((__m128i*)&src[(i + 1) * stride + 48]);
+
+ __m128i S00 = _mm_maddubs_epi16(T00, c8_1);
+ __m128i S01 = _mm_maddubs_epi16(T01, c8_1);
+ __m128i S02 = _mm_maddubs_epi16(T02, c8_1);
+ __m128i S03 = _mm_maddubs_epi16(T03, c8_1);
+ __m128i S10 = _mm_maddubs_epi16(T10, c8_1);
+ __m128i S11 = _mm_maddubs_epi16(T11, c8_1);
+ __m128i S12 = _mm_maddubs_epi16(T12, c8_1);
+ __m128i S13 = _mm_maddubs_epi16(T13, c8_1);
+
+ __m128i S20 = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(S00, S10), c16_2), 2);
+ __m128i S21 = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(S01, S11), c16_2), 2);
+ __m128i S22 = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(S02, S12), c16_2), 2);
+ __m128i S23 = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(S03, S13), c16_2), 2);
+
+ _mm_storeu_si128((__m128i*)&dst[(i >> 1) * 32 + 0], _mm_packus_epi16(S20, S21));
+ _mm_storeu_si128((__m128i*)&dst[(i >> 1) * 32 + 16], _mm_packus_epi16(S22, S23));
+ }
+}
+#endif /* if INSTRSET >= 4 */
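A side note on the intrinsic used above: _mm_maddubs_epi16 multiplies unsigned bytes by signed bytes and adds adjacent products into signed 16-bit lanes, so with all weights set to 1 it simply sums adjacent pixel pairs, and since 255 + 255 = 510 is far below the 16-bit saturation limit no overflow can occur. A tiny standalone check of that pair-sum behaviour (illustrative only, needs an SSSE3-capable build, e.g. -mssse3):

#include <tmmintrin.h>
#include <cstdio>
#include <cstdint>

int main()
{
    uint8_t src[16];
    for (int i = 0; i < 16; i++)
        src[i] = (uint8_t)(240 + i);    // values near the top of the 8-bit range

    __m128i v        = _mm_loadu_si128((const __m128i*)src);
    __m128i ones     = _mm_set1_epi8(1);
    __m128i pairSums = _mm_maddubs_epi16(v, ones);   // 8 x 16-bit sums of adjacent bytes

    int16_t out[8];
    _mm_storeu_si128((__m128i*)out, pairSums);

    for (int i = 0; i < 8; i++)
    {
        int expect = src[2 * i] + src[2 * i + 1];
        printf("pair %d: simd=%d scalar=%d%s\n", i, out[i], expect,
               out[i] == expect ? "" : "  <-- mismatch");
    }
    return 0;
}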