[x265] [PATCH] main10: create a hybrid all-angs primitive for 16bpp compiles

Steve Borho steve at borho.org
Fri Aug 8 06:48:11 CEST 2014


# HG changeset patch
# User Steve Borho <steve at borho.org>
# Date 1407473053 18000
#      Thu Aug 07 23:44:13 2014 -0500
# Node ID 33702c567e506ec63a7a63878c54987d507feba5
# Parent  49b593197330ec74af5b4b5fffac7e56cd255b3e
main10: create a hybrid all-angs primitive for 16bpp compiles

The all-angs primitive is highly optimized assembly code that avoids a lot of
redundant work.  The all-angs C ref is horribly slow, doing redundant work to
mimic the output of the all-angs assembly code. Since we have no high bit depth
assembly for these functions, we'll use a shim C function that works very
similarly to the C ref but at least uses optimized primitives.

intra_allangs4x4	3.64x 	 6619.54  	 24097.30
intra_allangs8x8	5.66x 	 13722.49 	 77694.97
intra_allangs32x32	4.57x 	 246943.81 	 1129159.50

before:
encoded 1253 frames in 104.37s (12.01 fps), 366.08 kb/s, SSIM Mean Y: 0.9889624 (19.571 dB)

after:
encoded 1253 frames in 95.62s (13.10 fps), 366.08 kb/s, SSIM Mean Y: 0.9889624 (19.571 dB)

diff -r 49b593197330 -r 33702c567e50 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Aug 07 22:42:47 2014 -0500
+++ b/source/common/x86/asm-primitives.cpp	Thu Aug 07 23:44:13 2014 -0500
@@ -1230,6 +1230,35 @@
 
 namespace x265 {
 // private x265 namespace
+
+#if HIGH_BIT_DEPTH
+extern unsigned char IntraFilterType[][35];
+
+/* Very similar to CRef in intrapred.cpp, except it uses optimized primitives. Writes one size*size block into dest for each mode 2..34. */
+template<int log2Size>
+void intra_allangs(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, int bLuma)
+{
+    const int size = 1 << log2Size; // block edge length in pixels (4, 8, 16 or 32 for the instantiations registered in this file)
+    const int sizeIdx = log2Size - 2; // first index into IntraFilterType and the primitive tables (0 when log2Size is 2)
+    ALIGN_VAR_32(pixel, buffer[32 * 32]); // scratch block used only by the transposed modes; sized for the largest (32x32) case
+
+    for (int mode = 2; mode <= 34; mode++)
+    {
+        pixel *left = (IntraFilterType[sizeIdx][mode] ? left1 : left0); // nonzero table entry selects the "1" neighbour pointers (presumably the filtered set -- TODO confirm)
+        pixel *above = (IntraFilterType[sizeIdx][mode] ? above1 : above0); // same selection rule as for left
+        pixel *out = dest + ((mode - 2) << (log2Size * 2)); // each mode owns size*size consecutive pixels of dest, starting with mode 2 at offset 0
+
+        if (mode < 18)
+        {
+            primitives.intra_pred[sizeIdx][mode](buffer, size, left, above, mode, bLuma); // modes 2..17: predict into the scratch buffer first
+            primitives.transpose[sizeIdx](out, buffer, size); // then store the transposed block in this mode's slot
+        }
+        else
+            primitives.intra_pred[sizeIdx][mode](out, size, left, above, mode, bLuma); // modes 18..34: predict straight into the output slot
+    }
+}
+#endif
+
 void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask)
 {
 #if HIGH_BIT_DEPTH
@@ -1434,6 +1463,14 @@
         p.chroma[X265_CSP_I422].copy_pp[i] = (copy_pp_t)p.chroma[X265_CSP_I422].copy_ss[i];
     }
 
+    if (p.intra_pred[0][0] && p.transpose[0])
+    {
+        p.intra_pred_allangs[BLOCK_4x4] = intra_allangs<2>;
+        p.intra_pred_allangs[BLOCK_8x8] = intra_allangs<3>;
+        p.intra_pred_allangs[BLOCK_16x16] = intra_allangs<4>;
+        p.intra_pred_allangs[BLOCK_32x32] = intra_allangs<5>;
+    }
+
 #else // if HIGH_BIT_DEPTH
     if (cpuMask & X265_CPU_SSE2)
     {
diff -r 49b593197330 -r 33702c567e50 source/test/testbench.cpp
--- a/source/test/testbench.cpp	Thu Aug 07 22:42:47 2014 -0500
+++ b/source/test/testbench.cpp	Thu Aug 07 23:44:13 2014 -0500
@@ -195,6 +195,7 @@
         EncoderPrimitives asmprim;
         memset(&asmprim, 0, sizeof(asmprim));
         Setup_Assembly_Primitives(asmprim, test_arch[i].flag);
+        memcpy(&primitives, &asmprim, sizeof(EncoderPrimitives));
         for (size_t h = 0; h < sizeof(harness) / sizeof(TestHarness*); h++)
         {
             if (testname && strncmp(testname, harness[h]->getName(), strlen(testname)))
@@ -215,6 +216,11 @@
     Setup_Assembly_Primitives(optprim, cpuid);
     Setup_Alias_Primitives(optprim);
 
+    /* some hybrid primitives may rely on other primitives in the
+     * global primitive table, so set up those pointers. This is a
+     * bit ugly, but I don't see a better solution */
+    memcpy(&primitives, &optprim, sizeof(EncoderPrimitives));
+
     printf("\nTest performance improvement with full optimizations\n");
 
     for (size_t h = 0; h < sizeof(harness) / sizeof(TestHarness*); h++)


More information about the x265-devel mailing list