[x265] [PATCH] main10: create a hybrid all-angs primitive for 16bpp compiles
Steve Borho
steve at borho.org
Fri Aug 8 06:48:11 CEST 2014
# HG changeset patch
# User Steve Borho <steve at borho.org>
# Date 1407473053 18000
# Thu Aug 07 23:44:13 2014 -0500
# Node ID 33702c567e506ec63a7a63878c54987d507feba5
# Parent 49b593197330ec74af5b4b5fffac7e56cd255b3e
main10: create a hybrid all-angs primitive for 16bpp compiles
The all-angs primitive is highly optimized assembly code that avoids a lot of
redundant work. The all-angs C ref is horribly slow, doing redundant work to
mimic the output of the all-angs assembly code. Since we have no high bit depth
assembly for these functions, we'll use a shim C function that works very
similarly to the C ref but at least uses optimized primitives.
intra_allangs4x4 3.64x 6619.54 24097.30
intra_allangs8x8 5.66x 13722.49 77694.97
intra_allangs32x32 4.57x 246943.81 1129159.50
before:
encoded 1253 frames in 104.37s (12.01 fps), 366.08 kb/s, SSIM Mean Y: 0.9889624 (19.571 dB)
after:
encoded 1253 frames in 95.62s (13.10 fps), 366.08 kb/s, SSIM Mean Y: 0.9889624 (19.571 dB)
diff -r 49b593197330 -r 33702c567e50 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Aug 07 22:42:47 2014 -0500
+++ b/source/common/x86/asm-primitives.cpp Thu Aug 07 23:44:13 2014 -0500
@@ -1230,6 +1230,35 @@
namespace x265 {
// private x265 namespace
+
+#if HIGH_BIT_DEPTH
+extern unsigned char IntraFilterType[][35];
+
+/* Very similar to CRef in intrapred.cpp, except it uses optimized primitives; fills dest with one size x size block per mode 2..34 */
+template<int log2Size> // instantiated with 2..5 for the 4x4..32x32 table entries
+void intra_allangs(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, int bLuma)
+{
+ const int size = 1 << log2Size; // block dimension, also passed as the second argument of the primitives below
+ const int sizeIdx = log2Size - 2; // row index into IntraFilterType and the per-size primitive tables
+ ALIGN_VAR_32(pixel, buffer[32 * 32]); // scratch block for predictions that are transposed afterwards
+
+ for (int mode = 2; mode <= 34; mode++)
+ {
+ pixel *left = (IntraFilterType[sizeIdx][mode] ? left1 : left0); // nonzero table entry selects the "1" neighbour set (presumably the filtered one - confirm)
+ pixel *above = (IntraFilterType[sizeIdx][mode] ? above1 : above0);
+ pixel *out = dest + ((mode - 2) << (log2Size * 2)); // (mode - 2) * size * size: each mode gets its own block in dest
+
+ if (mode < 18)
+ {
+ primitives.intra_pred[sizeIdx][mode](buffer, size, left, above, mode, bLuma); // predict into scratch first
+ primitives.transpose[sizeIdx](out, buffer, size); // then transpose into the mode's output slot
+ }
+ else
+ primitives.intra_pred[sizeIdx][mode](out, size, left, above, mode, bLuma); // modes 18..34 are written to dest directly
+ }
+}
+#endif
+
void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask)
{
#if HIGH_BIT_DEPTH
@@ -1434,6 +1463,14 @@
p.chroma[X265_CSP_I422].copy_pp[i] = (copy_pp_t)p.chroma[X265_CSP_I422].copy_ss[i];
}
+ if (p.intra_pred[0][0] && p.transpose[0])
+ {
+ p.intra_pred_allangs[BLOCK_4x4] = intra_allangs<2>;
+ p.intra_pred_allangs[BLOCK_8x8] = intra_allangs<3>;
+ p.intra_pred_allangs[BLOCK_16x16] = intra_allangs<4>;
+ p.intra_pred_allangs[BLOCK_32x32] = intra_allangs<5>;
+ }
+
#else // if HIGH_BIT_DEPTH
if (cpuMask & X265_CPU_SSE2)
{
diff -r 49b593197330 -r 33702c567e50 source/test/testbench.cpp
--- a/source/test/testbench.cpp Thu Aug 07 22:42:47 2014 -0500
+++ b/source/test/testbench.cpp Thu Aug 07 23:44:13 2014 -0500
@@ -195,6 +195,7 @@
EncoderPrimitives asmprim;
memset(&asmprim, 0, sizeof(asmprim));
Setup_Assembly_Primitives(asmprim, test_arch[i].flag);
+ memcpy(&primitives, &asmprim, sizeof(EncoderPrimitives));
for (size_t h = 0; h < sizeof(harness) / sizeof(TestHarness*); h++)
{
if (testname && strncmp(testname, harness[h]->getName(), strlen(testname)))
@@ -215,6 +216,11 @@
Setup_Assembly_Primitives(optprim, cpuid);
Setup_Alias_Primitives(optprim);
+ /* some hybrid primitives may rely on other primitives in the
+ * global primitive table, so set up those pointers. This is a
+ * bit ugly, but I don't see a better solution */
+ memcpy(&primitives, &optprim, sizeof(EncoderPrimitives));
+
printf("\nTest performance improvement with full optimizations\n");
for (size_t h = 0; h < sizeof(harness) / sizeof(TestHarness*); h++)
More information about the x265-devel
mailing list