[x265] [PATCH] psyCost_ss: C code optimization, suitable for ASM conversion

praveen at multicorewareinc.com praveen at multicorewareinc.com
Fri Dec 12 11:18:22 CET 2014


# HG changeset patch
# User Praveen Tiwari
# Date 1418379370 -19800
# Node ID a91546c2c96b1450792b19843c8dbc41191c1ef2
# Parent  ca805794c7519ef698776d0f717fd66b0d3bc626
psyCost_ss: C code optimization, suitable for ASM conversion

psy_ss_sum_std_4x4 is a combined version of satd_4x4 and sad<4, 4>, eliminating unnecessary load
operations and the extra arguments (zeroBuf and 0). It also replaces the SAD operation with a sum of
absolute values, as the second pixel buffer is zero.

diff -r ca805794c751 -r a91546c2c96b source/common/pixel.cpp
--- a/source/common/pixel.cpp	Fri Dec 12 15:03:32 2014 +0530
+++ b/source/common/pixel.cpp	Fri Dec 12 15:46:10 2014 +0530
@@ -866,6 +866,38 @@
     }
 }
 
+int psy_ss_sum_std_4x4(const int16_t* source, intptr_t sstride) // fused 4x4 energy term: replaces satd_4x4(src, stride, zeroBuf, 0) - (sad<4, 4>(src, stride, zeroBuf, 0) >> 2)
+{
+    sum2_t tmp[4][2]; // per-row packed butterfly outputs, consumed column-wise by the second loop
+    sum2_t a0, a1, a2, a3, b0, b1;
+    sum2_t sum = 0; // running total of the transformed absolute values
+    int sum0 = 0;
+    uint32_t sum1 = 0; // running total of the plain absolute sample values
+
+    for (int i = 0; i < 4; i++, source += sstride) // one row per pass; sstride advances the pointer in int16_t elements
+    {
+        a0 = source[0];
+        a1 = source[1];
+        a2 = source[2];
+        a3 = source[3];
+        sum1 = sum1 + (abs(source[0]) + abs(source[1]) + abs(source[2]) + abs(source[3])); // sum of |x| stands in for SAD since the second buffer was all zeros
+        b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM); // pack the sum in the low part and the difference above BITS_PER_SUM
+        b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM); // same packing for the second pair of samples
+        tmp[i][0] = b0 + b1;
+        tmp[i][1] = b0 - b1;
+    }
+
+    for (int i = 0; i < 2; i++) // each tmp column carries two packed values, so two passes cover the block
+    {
+        HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]); // NOTE(review): macro defined outside this hunk; presumably the 4-point butterfly down the rows - confirm
+        a0 = abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3); // NOTE(review): abs2 defined elsewhere; presumably absolute value of each packed half - confirm
+        sum += ((sum_t)a0) + (a0 >> BITS_PER_SUM); // unpack: add the low part (truncated to sum_t) and the part above BITS_PER_SUM
+    }
+
+    sum0 = (int)(sum >> 1); // halve the transformed sum
+    return (int)(sum0 - (sum1 >> 2)); // subtract a quarter of the absolute sum; int - uint32_t is evaluated unsigned, then cast back to int
+}
+
 template<int size>
 int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
 {
@@ -893,8 +925,8 @@
     else
     {
         /* 4x4 is too small for sa8d */
-        int sourceEnergy = satd_4x4(source, sstride, zeroBuf, 0) - (sad<4, 4>(source, sstride, zeroBuf, 0) >> 2);
-        int reconEnergy = satd_4x4(recon, rstride, zeroBuf, 0) - (sad<4, 4>(recon, rstride, zeroBuf, 0) >> 2);
+        int sourceEnergy = psy_ss_sum_std_4x4(source, sstride);
+        int reconEnergy = psy_ss_sum_std_4x4(recon, rstride);
         return abs(sourceEnergy - reconEnergy);
     }
 }


More information about the x265-devel mailing list