[x265] [PATCH] asm: wrapper for findPosLast loop

Min Chen chenm003 at 163.com
Wed Mar 18 00:59:48 CET 2015


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1426636780 25200
# Node ID 4747dea7f80510ad4799a073ffbaaa76be0a7182
# Parent  cc579ffe4b7e0cd2b554a573ba70ff8e8c726d42
asm: wrapper for findPosLast loop
---
 source/common/dct.cpp        |   35 ++++++++++++++++++
 source/common/primitives.h   |    4 ++
 source/encoder/entropy.cpp   |   28 +-------------
 source/test/pixelharness.cpp |   83 ++++++++++++++++++++++++++++++++++++++++++
 source/test/pixelharness.h   |    1 +
 5 files changed, 125 insertions(+), 26 deletions(-)

diff -r cc579ffe4b7e -r 4747dea7f805 source/common/dct.cpp
--- a/source/common/dct.cpp	Tue Mar 17 11:11:32 2015 +0530
+++ b/source/common/dct.cpp	Tue Mar 17 16:59:40 2015 -0700
@@ -752,6 +752,39 @@
     }
 }
 
+int findPosLast_c(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig)
+{
+    memset(coeffNum, 0, MLS_GRP_NUM * sizeof(*coeffNum));    // per-group count of non-zero coefficients
+    memset(coeffFlag, 0, MLS_GRP_NUM * sizeof(*coeffFlag));  // per-group bit mask: one bit per scanned position
+    memset(coeffSign, 0, MLS_GRP_NUM * sizeof(*coeffSign));  // per-group bit mask: one bit per non-zero coefficient
+    // Walk the scan table from index 0 until numSig non-zero coefficients have been consumed.
+    int scanPosLast = 0;
+    do  // the body always runs at least once, even when numSig <= 0 on entry
+    {
+        const uint32_t cgIdx = (uint32_t)scanPosLast >> MLS_CG_SIZE;  // index into the per-group output arrays
+
+        const uint32_t posLast = scan[scanPosLast++];  // scan index -> position inside coeff[]
+
+        const int curCoeff = coeff[posLast];
+        const uint32_t isNZCoeff = (curCoeff != 0);
+        // get L1 sig map
+        // NOTE: the replacement algorithm is hard to follow, so the reference code is kept below (disabled)
+        //uint32_t posy   = posLast >> log2TrSize;
+        //uint32_t posx   = posLast - (posy << log2TrSize);
+        //uint32_t blkIdx0 = ((posy >> MLS_CG_LOG2_SIZE) << codingParameters.log2TrSizeCG) + (posx >> MLS_CG_LOG2_SIZE);
+        //const uint32_t blkIdx = ((posLast >> (2 * MLS_CG_LOG2_SIZE)) & ~maskPosXY) + ((posLast >> MLS_CG_LOG2_SIZE) & maskPosXY);
+        //sigCoeffGroupFlag64 |= ((uint64_t)isNZCoeff << blkIdx);
+        numSig -= isNZCoeff;  // one fewer non-zero coefficient left to find
+
+        // TODO: optimize by instruction BTS
+        coeffSign[cgIdx] += (uint16_t)(((uint32_t)curCoeff >> 31) << coeffNum[cgIdx]);  // bit k = sign of the k-th non-zero coeff seen in this group (assumes 32-bit int)
+        coeffFlag[cgIdx] = (coeffFlag[cgIdx] << 1) + (uint16_t)isNZCoeff;  // shift in one flag per scanned position, newest at bit 0
+        coeffNum[cgIdx] += (uint8_t)isNZCoeff;  // must follow the coeffSign update, which uses the old count as its shift
+    }
+    while (numSig > 0);  // NOTE(review): scanPosLast is not bounded here; numSig must not exceed the non-zero coefficients reachable via scan
+    return scanPosLast - 1;  // undo the last post-increment: scan index of the final position visited
+}
+
 }  // closing - anonymous file-static namespace
 
 namespace x265 {
@@ -783,5 +816,7 @@
     p.cu[BLOCK_8x8].copy_cnt   = copy_count<8>;
     p.cu[BLOCK_16x16].copy_cnt = copy_count<16>;
     p.cu[BLOCK_32x32].copy_cnt = copy_count<32>;
+
+    p.findPosLast = findPosLast_c;
 }
 }
diff -r cc579ffe4b7e -r 4747dea7f805 source/common/primitives.h
--- a/source/common/primitives.h	Tue Mar 17 11:11:32 2015 +0530
+++ b/source/common/primitives.h	Tue Mar 17 16:59:40 2015 -0700
@@ -179,6 +179,8 @@
 
 typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
 
+typedef int (*findPosLast_t)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig);
+
 /* Function pointers to optimized encoder primitives. Each pointer can reference
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */
 struct EncoderPrimitives
@@ -289,6 +291,8 @@
 
     filter_p2s_wxh_t      luma_p2s;
 
+    findPosLast_t         findPosLast;
+
     /* There is one set of chroma primitives per color space. An encoder will
      * have just a single color space and thus it will only ever use one entry
      * in this array. However we always fill all entries in the array in case
diff -r cc579ffe4b7e -r 4747dea7f805 source/encoder/entropy.cpp
--- a/source/encoder/entropy.cpp	Tue Mar 17 11:11:32 2015 +0530
+++ b/source/encoder/entropy.cpp	Tue Mar 17 16:59:40 2015 -0700
@@ -1451,9 +1451,6 @@
     uint8_t coeffNum[MLS_GRP_NUM];      // value range[0, 16]
     uint16_t coeffSign[MLS_GRP_NUM];    // bit mask map for non-zero coeff sign
     uint16_t coeffFlag[MLS_GRP_NUM];    // bit mask map for non-zero coeff
-    memset(coeffNum, 0, sizeof(coeffNum));
-    memset(coeffFlag, 0, sizeof(coeffFlag));
-    memset(coeffSign, 0, sizeof(coeffSign));
 
     //----- encode significance map -----
 
@@ -1464,30 +1461,9 @@
     //const uint32_t maskPosXY = ((uint32_t)~0 >> (31 - log2TrSize + MLS_CG_LOG2_SIZE)) >> 1;
     X265_CHECK((uint32_t)((1 << (log2TrSize - MLS_CG_LOG2_SIZE)) - 1) == (((uint32_t)~0 >> (31 - log2TrSize + MLS_CG_LOG2_SIZE)) >> 1), "maskPosXY fault\n");
 
-    do
-    {
-        const uint32_t cgIdx = (uint32_t)scanPosLast >> MLS_CG_SIZE;
+    scanPosLast = primitives.findPosLast(codingParameters.scan, coeff, coeffSign, coeffFlag, coeffNum, numSig);
+    posLast = codingParameters.scan[scanPosLast];
 
-        posLast = codingParameters.scan[scanPosLast++];
-
-        const int curCoeff = coeff[posLast];
-        const uint32_t isNZCoeff = (curCoeff != 0);
-        // get L1 sig map
-        // NOTE: the new algorithm is complicated, so I keep reference code here
-        //uint32_t posy   = posLast >> log2TrSize;
-        //uint32_t posx   = posLast - (posy << log2TrSize);
-        //uint32_t blkIdx0 = ((posy >> MLS_CG_LOG2_SIZE) << codingParameters.log2TrSizeCG) + (posx >> MLS_CG_LOG2_SIZE);
-        //const uint32_t blkIdx = ((posLast >> (2 * MLS_CG_LOG2_SIZE)) & ~maskPosXY) + ((posLast >> MLS_CG_LOG2_SIZE) & maskPosXY);
-        //sigCoeffGroupFlag64 |= ((uint64_t)isNZCoeff << blkIdx);
-        numSig -= isNZCoeff;
-
-        // TODO: optimize by instruction BTS
-        coeffSign[cgIdx] += (uint16_t)(((uint32_t)curCoeff >> 31) << coeffNum[cgIdx]);
-        coeffFlag[cgIdx] = (coeffFlag[cgIdx] << 1) + (uint16_t)isNZCoeff;
-        coeffNum[cgIdx] += (uint8_t)isNZCoeff;
-    }
-    while (numSig > 0);
-    scanPosLast--;
     const int lastScanSet = scanPosLast >> MLS_CG_SIZE;
 
     // Calculate CG block non-zero mask, the latest CG always flag as non-zero in CG scan loop
diff -r cc579ffe4b7e -r 4747dea7f805 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Tue Mar 17 11:11:32 2015 +0530
+++ b/source/test/pixelharness.cpp	Tue Mar 17 16:59:40 2015 -0700
@@ -1149,6 +1149,71 @@
     return true;
 }
 
+bool PixelHarness::check_findPosLast(findPosLast_t ref, findPosLast_t opt)
+{
+    ALIGN_VAR_16(coeff_t, ref_src[32 * 32 + ITERS * 2]);  // 32x32 random coefficients followed by ITERS * 2 padding entries
+    uint8_t ref_coeffNum[MLS_GRP_NUM], opt_coeffNum[MLS_GRP_NUM];      // value range[0, 16]
+    uint16_t ref_coeffSign[MLS_GRP_NUM], opt_coeffSign[MLS_GRP_NUM];    // bit mask map for non-zero coeff sign
+    uint16_t ref_coeffFlag[MLS_GRP_NUM], opt_coeffFlag[MLS_GRP_NUM];    // bit mask map for non-zero coeff
+
+    int totalCoeffs = 0;  // NOTE(review): accumulated below but never read in this function
+    for (int i = 0; i < 32 * 32; i++)
+    {
+        ref_src[i] = rand() & SHORT_MAX;  // NOTE(review): never negative, so negative coefficients are not exercised
+        totalCoeffs += (ref_src[i] != 0);
+    }
+
+    // padding after the 32x32 area, filled with the non-zero value 0x1234 (presumably so shifted windows read defined data)
+    for (int i = 0; i < ITERS * 2; i++)
+    {
+        ref_src[32 * 32 + i] = 0x1234;
+    }
+    
+
+    memset(ref_coeffNum, 0xCD, sizeof(ref_coeffNum));  // poison the outputs once, before the loop (not per iteration)
+    memset(ref_coeffSign, 0xCD, sizeof(ref_coeffSign));
+    memset(ref_coeffFlag, 0xCD, sizeof(ref_coeffFlag));
+
+    memset(opt_coeffNum, 0xCD, sizeof(opt_coeffNum));
+    memset(opt_coeffSign, 0xCD, sizeof(opt_coeffSign));
+    memset(opt_coeffFlag, 0xCD, sizeof(opt_coeffFlag));
+
+    for (int i = 0; i < ITERS; i++)  // iteration i tests the coefficient window starting at ref_src + i
+    {
+        int rand_scan_type = rand() % NUM_SCAN_TYPE;
+        int rand_scan_size = rand() % NUM_SCAN_SIZE;
+        int rand_numCoeff = 0;
+
+        for (int j = 0; j < 1 << (2 * (rand_scan_size + 2)); j++)  // count non-zeros in the first 4^(rand_scan_size + 2) window entries
+            rand_numCoeff += (ref_src[i + j] != 0);
+
+        const uint16_t* const scanTbl = g_scanOrder[rand_scan_type][rand_scan_size];
+
+        int ref_scanPos = ref(scanTbl, ref_src + i, ref_coeffSign, ref_coeffFlag, ref_coeffNum, rand_numCoeff);
+        int opt_scanPos = (int)checked(opt, scanTbl, ref_src + i, opt_coeffSign, opt_coeffFlag, opt_coeffNum, rand_numCoeff);
+
+        if (ref_scanPos != opt_scanPos)
+            return false;
+
+        for (int j = 0; rand_numCoeff; j++)  // compare group by group until every counted non-zero is accounted for
+        {
+            if (ref_coeffSign[j] != opt_coeffSign[j])
+                return false;
+
+            if (ref_coeffFlag[j] != opt_coeffFlag[j])
+                return false;
+
+            if (ref_coeffNum[j] != opt_coeffNum[j])
+                return false;
+
+            rand_numCoeff -= ref_coeffNum[j];  // groups past the last non-zero coefficient are not compared
+        }
+
+        reportfail();
+    }
+
+    return true;  // every iteration produced identical scan position and per-group outputs
+}
 
 bool PixelHarness::testPU(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt)
 {
@@ -1653,6 +1718,15 @@
         }
     }
 
+    if (opt.findPosLast)
+    {
+        if (!check_findPosLast(ref.findPosLast, opt.findPosLast))
+        {
+            printf("findPosLast failed!\n");
+            return false;
+        }
+    }
+
     return true;
 }
 
@@ -2003,4 +2077,13 @@
         HEADER0("propagateCost");
         REPORT_SPEEDUP(opt.propagateCost, ref.propagateCost, ibuf1, ushort_test_buff[0], int_test_buff[0], ushort_test_buff[0], int_test_buff[0], double_test_buff[0], 80);
     }
+
+    if (opt.findPosLast)
+    {
+        HEADER("findPosLast");
+        coeff_t coefBuf[32 * 32];
+        memset(coefBuf, 0, sizeof(coefBuf));
+        memset(coefBuf + 32 * 31, 1, 32 * sizeof(coeff_t));
+        REPORT_SPEEDUP(opt.findPosLast, ref.findPosLast, g_scanOrder[SCAN_DIAG][NUM_SCAN_SIZE - 1], coefBuf, (uint16_t*)sbuf1, (uint16_t*)sbuf2, (uint8_t*)psbuf1, 32);
+    }
 }
diff -r cc579ffe4b7e -r 4747dea7f805 source/test/pixelharness.h
--- a/source/test/pixelharness.h	Tue Mar 17 11:11:32 2015 +0530
+++ b/source/test/pixelharness.h	Tue Mar 17 16:59:40 2015 -0700
@@ -104,6 +104,7 @@
     bool check_psyCost_pp(pixelcmp_t ref, pixelcmp_t opt);
     bool check_psyCost_ss(pixelcmp_ss_t ref, pixelcmp_ss_t opt);
     bool check_calSign(sign_t ref, sign_t opt);
+    bool check_findPosLast(findPosLast_t ref, findPosLast_t opt);
 
 public:
 



More information about the x265-devel mailing list