[x265] [PATCH] testbench: added new optimized c primitive for psyCost_pp, suitable to write asm code

Mon Dec 15 09:47:35 CET 2014

# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1418633185 -19800
#      Mon Dec 15 14:16:25 2014 +0530
# Node ID ff352d647f4b3a8f0c249fc7a8f4eb3645aaa974
# Parent  6ba7be7b169783db1d667d1140e51b68ff4b64fb
testbench: added new optimized c primitive for psyCost_pp, suitable to write asm code

In the new primitive, sa8d_8x8 and sad_8x8 are combined to save redundant loads, and the unnecessary zeroBuffer is removed.
The testbench checks the new C code against the old C code for correctness.

diff -r 6ba7be7b1697 -r ff352d647f4b source/common/pixel.cpp

--- a/source/common/pixel.cpp	Sat Dec 13 01:03:19 2014 -0600
+++ b/source/common/pixel.cpp	Mon Dec 15 14:16:25 2014 +0530
@@ -801,6 +801,117 @@
     }
 }
 
+void psy_acEnergy_pp_8x8(const pixel* src, intptr_t stride, int* energy, int dim)
+{
+    int n = 0; // index of the next energy[] slot to fill
+    const pixel* tmpSrc = src;
+    // Purpose: walk the dim x dim area in 8x8 tiles (tile rows first) and store one energy value per tile in energy[].
+    for (int k = 0; k < dim; k += 8)
+    {
+        for (int j = 0; j < dim; j += 8)
+        {
+            src = tmpSrc + k * stride + j;
+            sum2_t tmp[8][4]; // sum2_t, as in psy_acEnergy_pp_4x4: the packed (a - b) << BITS_PER_SUM terms must not left-shift a negative signed value
+            sum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
+            sum2_t sum = 0, sum1 = 0;
+            // Row pass: sum1 accumulates the plain pixel sum; tmp receives HADAMARD4 of the packed sum/difference pairs.
+            for (int i = 0; i < 8; i++, src += stride)
+            {
+                a0 = src[0];
+                a1 = src[1];
+                b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
+                a2 = src[2];
+                a3 = src[3];
+                b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
+                a4 = src[4];
+                a5 = src[5];
+                b2 = (a4 + a5) + ((a4 - a5) << BITS_PER_SUM);
+                a6 = src[6];
+                a7 = src[7];
+                sum1 += a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7;
+                b3 = (a6 + a7) + ((a6 - a7) << BITS_PER_SUM);
+                HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], b0, b1, b2, b3);
+            }
+            // Column pass: combine rows 0-3 with rows 4-7 and add both packed halves of the abs2() totals into sum.
+            for (int i = 0; i < 4; i++)
+            {
+                HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
+                HADAMARD4(a4, a5, a6, a7, tmp[4][i], tmp[5][i], tmp[6][i], tmp[7][i]);
+                b0  = abs2(a0 + a4) + abs2(a0 - a4);
+                b0 += abs2(a1 + a5) + abs2(a1 - a5);
+                b0 += abs2(a2 + a6) + abs2(a2 - a6);
+                b0 += abs2(a3 + a7) + abs2(a3 - a7);
+                sum += (sum_t)b0 + (b0 >> BITS_PER_SUM); // low packed half + high packed half
+            }
+            // Scale: the transformed total is rounded and divided by 4; the pixel sum is divided by 4 (truncating).
+            sum = (int)((sum + 2) >> 2);
+            sum1 >>= 2;
+            // Tile energy = scaled transformed total minus scaled pixel sum.
+            energy[n++] = (int)(sum - sum1);
+        }
+    }
+}
+
+int psy_acEnergy_pp_4x4(const pixel* pix1, intptr_t stride_pix1)
+{
+    sum2_t tmp[4][2]; // per row: two packed terms, each carrying one value in the low bits and another shifted up by BITS_PER_SUM
+    sum2_t a0, a1, a2, a3, b0, b1;
+    sum2_t sum = 0;  // transformed total, built in the column pass
+    sum2_t sum1 = 0; // plain sum of the 16 pixels
+    // Purpose: energy value for the 4x4 block at pix1 = HADAMARD4-based total (halved) minus a quarter of the pixel sum.
+    for (int i = 0; i < 4; i++, pix1 += stride_pix1) // row pass, one row per iteration
+    {
+        a0 = pix1[0];
+        a1 = pix1[1];
+        a2 = pix1[2];
+        a3 = pix1[3];
+        sum1 = sum1 + (a0 + a1 + a2 + a3);
+        b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM); // pair sum in the low bits, pair difference shifted up
+        b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
+        tmp[i][0] = b0 + b1;
+        tmp[i][1] = b0 - b1;
+    }
+    // Column pass: run HADAMARD4 down each of the two tmp columns and fold the abs2() results into sum.
+    for (int i = 0; i < 2; i++)
+    {
+        HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
+        a0 = abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
+        sum += ((sum_t)a0) + (a0 >> BITS_PER_SUM); // low packed half + high packed half
+    }
+    // Result: halved transformed total minus the pixel sum divided by 4 (truncating).
+    sum = (int)(sum >> 1);
+    return (int)(sum - (sum1 >> 2));
+}
+
+template<int size>
+int psyCost_pp_opt(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
+{
+    /* Absolute energy difference between source and recon, summed over 8x8 tiles (or one 4x4 block) */
+    if (size == 0)
+    {
+        /* 4x4 is too small for sa8d */
+        const int srcAc = psy_acEnergy_pp_4x4(source, sstride);
+        const int recAc = psy_acEnergy_pp_4x4(recon, rstride);
+        return abs(srcAc - recAc);
+    }
+    else
+    {
+        const int width    = 1 << (size + 2);     /* block width in pixels */
+        const int numTiles = width >> (4 - size); /* equals (width / 8) squared */
+        int srcAc[64] = {0};
+        int recAc[64] = {0};
+
+        /* AC energy, measured by sa8d (AC + DC) minus SAD (DC) */
+        psy_acEnergy_pp_8x8(source, sstride, srcAc, width);
+        psy_acEnergy_pp_8x8(recon,  rstride, recAc, width);
+
+        uint32_t total = 0;
+        for (int t = 0; t < numTiles; t++)
+            total += abs(srcAc[t] - recAc[t]);
+        return total;
+    }
+}
+
 template<int size>
 int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
 {
@@ -1297,6 +1408,13 @@
     p.psy_cost_pp[BLOCK_32x32] = psyCost_pp<BLOCK_32x32>;
     p.psy_cost_pp[BLOCK_64x64] = psyCost_pp<BLOCK_64x64>;
 
+    // psy_cost_pp[NUM_SQUARE_BLOCKS .. NUM_SQUARE_BLOCKS * 2 - 1]: new optimized C implementations
+    p.psy_cost_pp[BLOCK_4x4 + NUM_SQUARE_BLOCKS]   = psyCost_pp_opt<BLOCK_4x4>;
+    p.psy_cost_pp[BLOCK_8x8 + NUM_SQUARE_BLOCKS]   = psyCost_pp_opt<BLOCK_8x8>;
+    p.psy_cost_pp[BLOCK_16x16 + NUM_SQUARE_BLOCKS] = psyCost_pp_opt<BLOCK_16x16>;
+    p.psy_cost_pp[BLOCK_32x32 + NUM_SQUARE_BLOCKS] = psyCost_pp_opt<BLOCK_32x32>;
+    p.psy_cost_pp[BLOCK_64x64 + NUM_SQUARE_BLOCKS] = psyCost_pp_opt<BLOCK_64x64>;
+
     p.psy_cost_ss[BLOCK_4x4] = psyCost_ss<BLOCK_4x4>;
     p.psy_cost_ss[BLOCK_8x8] = psyCost_ss<BLOCK_8x8>;
     p.psy_cost_ss[BLOCK_16x16] = psyCost_ss<BLOCK_16x16>;
diff -r 6ba7be7b1697 -r ff352d647f4b source/common/primitives.h
--- a/source/common/primitives.h	Sat Dec 13 01:03:19 2014 -0600
+++ b/source/common/primitives.h	Mon Dec 15 14:16:25 2014 +0530
@@ -211,7 +211,11 @@
     pixelcmp_t            satd[NUM_LUMA_PARTITIONS];       // Sum of Transformed differences (HADAMARD)
     pixelcmp_t            sa8d_inter[NUM_LUMA_PARTITIONS]; // sa8d primitives for motion search partitions
     pixelcmp_t            sa8d[NUM_SQUARE_BLOCKS];         // sa8d primitives for square intra blocks
-    pixelcmp_t            psy_cost_pp[NUM_SQUARE_BLOCKS];  // difference in AC energy between two blocks
+
+    // psy_cost_pp[0 .. NUM_SQUARE_BLOCKS - 1]: original C implementations
+    // psy_cost_pp[NUM_SQUARE_BLOCKS .. NUM_SQUARE_BLOCKS * 2 - 1]: new optimized C implementations
+    // TODO: once the asm code is complete, the "* 2" can be removed
+    pixelcmp_t            psy_cost_pp[NUM_SQUARE_BLOCKS * 2];  // difference in AC energy between two blocks
     pixelcmp_ss_t         psy_cost_ss[NUM_SQUARE_BLOCKS];
 
     dct_t                 dct[NUM_DCTS];
diff -r 6ba7be7b1697 -r ff352d647f4b source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Sat Dec 13 01:03:19 2014 -0600
+++ b/source/test/pixelharness.cpp	Mon Dec 15 14:16:25 2014 +0530
@@ -948,6 +948,29 @@
     return true;
 }
 
+bool PixelHarness::check_psyCost_pp(pixelcmp_t ref, pixelcmp_t opt)
+{
+    int j = 0; // offset into the test buffers, advanced by INCR after every iteration
+    intptr_t stride = STRIDE;
+    // Run ref and opt on ITERS randomly selected buffer pairs; return false on the first differing result.
+    for (int i = 0; i < ITERS; i++)
+    {
+        int index1 = rand() % TEST_CASES;
+        int index2 = rand() % TEST_CASES;
+        // NOTE(review): psyCost_pp_opt takes const pixel*; confirm uchar_test_buff's element type matches pixel in every build.
+        int optres = (int)checked(opt, uchar_test_buff[index1] + j, stride, uchar_test_buff[index2] + j, stride);
+        int refres = ref(uchar_test_buff[index1] + j, stride, uchar_test_buff[index2] + j, stride);
+
+        if (optres != refres)
+            return false;
+
+        reportfail(); // harness helper not shown in this patch; presumably reports a failure flagged by checked() - confirm
+        j += INCR;
+    }
+
+    return true;
+}
+
 bool PixelHarness::testPartition(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt)
 {
     if (opt.satd[part])
@@ -1290,6 +1313,15 @@
                 return false;
             }
         }
+
+        if (ref.psy_cost_pp[i])
+        {
+            if (!check_psyCost_pp(ref.psy_cost_pp[i], ref.psy_cost_pp[NUM_SQUARE_BLOCKS + i]))
+            {
+                printf("\npsyCost_pp[%dx%d] failed!\n", 4 << i, 4 << i);
+                return false;
+            }
+        }
     }
 
     if (opt.weight_pp)
diff -r 6ba7be7b1697 -r ff352d647f4b source/test/pixelharness.h
--- a/source/test/pixelharness.h	Sat Dec 13 01:03:19 2014 -0600
+++ b/source/test/pixelharness.h	Mon Dec 15 14:16:25 2014 +0530
@@ -92,6 +92,7 @@
     bool check_saoCuOrgE0_t(saoCuOrgE0_t ref, saoCuOrgE0_t opt);
     bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);
     bool check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt);
+    bool check_psyCost_pp(pixelcmp_t ref, pixelcmp_t opt);
 
 public: