[x265] [PATCH] asm: wrapper for findPosLast loop
Min Chen
chenm003 at 163.com
Wed Mar 18 00:59:48 CET 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1426636780 25200
# Node ID 4747dea7f80510ad4799a073ffbaaa76be0a7182
# Parent cc579ffe4b7e0cd2b554a573ba70ff8e8c726d42
asm: wrapper for findPosLast loop
---
source/common/dct.cpp | 35 ++++++++++++++++++
source/common/primitives.h | 4 ++
source/encoder/entropy.cpp | 28 +-------------
source/test/pixelharness.cpp | 83 ++++++++++++++++++++++++++++++++++++++++++
source/test/pixelharness.h | 1 +
5 files changed, 125 insertions(+), 26 deletions(-)
diff -r cc579ffe4b7e -r 4747dea7f805 source/common/dct.cpp
--- a/source/common/dct.cpp Tue Mar 17 11:11:32 2015 +0530
+++ b/source/common/dct.cpp Tue Mar 17 16:59:40 2015 -0700
@@ -752,6 +752,39 @@
}
}
+int findPosLast_c(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig)
+{ // C reference: walk coeff[] in scan order until numSig non-zero values have been seen, filling the per-group count/flag/sign arrays
+ memset(coeffNum, 0, MLS_GRP_NUM * sizeof(*coeffNum)); // per-group count of non-zero coefficients
+ memset(coeffFlag, 0, MLS_GRP_NUM * sizeof(*coeffFlag)); // per-group bit mask of non-zero coefficients
+ memset(coeffSign, 0, MLS_GRP_NUM * sizeof(*coeffSign)); // per-group bit mask of coefficient signs
+ // all three outputs get MLS_GRP_NUM entries cleared here, so the caller does not have to pre-initialize them
+ int scanPosLast = 0;
+ do // do/while: at least one scan position is always examined, even when numSig <= 0 on entry
+ {
+ const uint32_t cgIdx = (uint32_t)scanPosLast >> MLS_CG_SIZE; // group index of this scan position (presumably MLS_CG_SIZE is log2 of the group size - confirm)
+ // translate the scan step into an index into coeff[], then advance the scan counter
+ const uint32_t posLast = scan[scanPosLast++];
+
+ const int curCoeff = coeff[posLast];
+ const uint32_t isNZCoeff = (curCoeff != 0); // 1 when the coefficient is non-zero, otherwise 0
+ // get L1 sig map
+ // NOTE: the replacement algorithm is hard to follow, so the reference code is kept here (commented out)
+ //uint32_t posy = posLast >> log2TrSize;
+ //uint32_t posx = posLast - (posy << log2TrSize);
+ //uint32_t blkIdx0 = ((posy >> MLS_CG_LOG2_SIZE) << codingParameters.log2TrSizeCG) + (posx >> MLS_CG_LOG2_SIZE);
+ //const uint32_t blkIdx = ((posLast >> (2 * MLS_CG_LOG2_SIZE)) & ~maskPosXY) + ((posLast >> MLS_CG_LOG2_SIZE) & maskPosXY);
+ //sigCoeffGroupFlag64 |= ((uint64_t)isNZCoeff << blkIdx);
+ numSig -= isNZCoeff; // one fewer non-zero coefficient left to find
+ // sign bit k of a group is set when the k-th non-zero coefficient scanned in that group is negative
+ // TODO: optimize by instruction BTS
+ coeffSign[cgIdx] += (uint16_t)(((uint32_t)curCoeff >> 31) << coeffNum[cgIdx]); // top bit of curCoeff (assumes 32-bit int) placed at the current non-zero count
+ coeffFlag[cgIdx] = (coeffFlag[cgIdx] << 1) + (uint16_t)isNZCoeff; // earlier positions shift up; bit 0 describes the position just scanned
+ coeffNum[cgIdx] += (uint8_t)isNZCoeff;
+ }
+ while (numSig > 0); // NOTE(review): this is the only exit test; scanPosLast is never checked against the length of scan[]
+ return scanPosLast - 1; // undo the final post-increment: scan index of the last position examined
+}
+
} // closing - anonymous file-static namespace
namespace x265 {
@@ -783,5 +816,7 @@
p.cu[BLOCK_8x8].copy_cnt = copy_count<8>;
p.cu[BLOCK_16x16].copy_cnt = copy_count<16>;
p.cu[BLOCK_32x32].copy_cnt = copy_count<32>;
+
+ p.findPosLast = findPosLast_c;
}
}
diff -r cc579ffe4b7e -r 4747dea7f805 source/common/primitives.h
--- a/source/common/primitives.h Tue Mar 17 11:11:32 2015 +0530
+++ b/source/common/primitives.h Tue Mar 17 16:59:40 2015 -0700
@@ -179,6 +179,8 @@
typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
+typedef int (*findPosLast_t)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig); // signature of the findPosLast primitive; the three non-const arrays are outputs and the result is an int scan position
+
/* Function pointers to optimized encoder primitives. Each pointer can reference
* either an assembly routine, a SIMD intrinsic primitive, or a C function */
struct EncoderPrimitives
@@ -289,6 +291,8 @@
filter_p2s_wxh_t luma_p2s;
+ findPosLast_t findPosLast;
+
/* There is one set of chroma primitives per color space. An encoder will
* have just a single color space and thus it will only ever use one entry
* in this array. However we always fill all entries in the array in case
diff -r cc579ffe4b7e -r 4747dea7f805 source/encoder/entropy.cpp
--- a/source/encoder/entropy.cpp Tue Mar 17 11:11:32 2015 +0530
+++ b/source/encoder/entropy.cpp Tue Mar 17 16:59:40 2015 -0700
@@ -1451,9 +1451,6 @@
uint8_t coeffNum[MLS_GRP_NUM]; // value range[0, 16]
uint16_t coeffSign[MLS_GRP_NUM]; // bit mask map for non-zero coeff sign
uint16_t coeffFlag[MLS_GRP_NUM]; // bit mask map for non-zero coeff
- memset(coeffNum, 0, sizeof(coeffNum));
- memset(coeffFlag, 0, sizeof(coeffFlag));
- memset(coeffSign, 0, sizeof(coeffSign));
//----- encode significance map -----
@@ -1464,30 +1461,9 @@
//const uint32_t maskPosXY = ((uint32_t)~0 >> (31 - log2TrSize + MLS_CG_LOG2_SIZE)) >> 1;
X265_CHECK((uint32_t)((1 << (log2TrSize - MLS_CG_LOG2_SIZE)) - 1) == (((uint32_t)~0 >> (31 - log2TrSize + MLS_CG_LOG2_SIZE)) >> 1), "maskPosXY fault\n");
- do
- {
- const uint32_t cgIdx = (uint32_t)scanPosLast >> MLS_CG_SIZE;
+ scanPosLast = primitives.findPosLast(codingParameters.scan, coeff, coeffSign, coeffFlag, coeffNum, numSig);
+ posLast = codingParameters.scan[scanPosLast];
- posLast = codingParameters.scan[scanPosLast++];
-
- const int curCoeff = coeff[posLast];
- const uint32_t isNZCoeff = (curCoeff != 0);
- // get L1 sig map
- // NOTE: the new algorithm is complicated, so I keep reference code here
- //uint32_t posy = posLast >> log2TrSize;
- //uint32_t posx = posLast - (posy << log2TrSize);
- //uint32_t blkIdx0 = ((posy >> MLS_CG_LOG2_SIZE) << codingParameters.log2TrSizeCG) + (posx >> MLS_CG_LOG2_SIZE);
- //const uint32_t blkIdx = ((posLast >> (2 * MLS_CG_LOG2_SIZE)) & ~maskPosXY) + ((posLast >> MLS_CG_LOG2_SIZE) & maskPosXY);
- //sigCoeffGroupFlag64 |= ((uint64_t)isNZCoeff << blkIdx);
- numSig -= isNZCoeff;
-
- // TODO: optimize by instruction BTS
- coeffSign[cgIdx] += (uint16_t)(((uint32_t)curCoeff >> 31) << coeffNum[cgIdx]);
- coeffFlag[cgIdx] = (coeffFlag[cgIdx] << 1) + (uint16_t)isNZCoeff;
- coeffNum[cgIdx] += (uint8_t)isNZCoeff;
- }
- while (numSig > 0);
- scanPosLast--;
const int lastScanSet = scanPosLast >> MLS_CG_SIZE;
// Calculate CG block non-zero mask, the latest CG always flag as non-zero in CG scan loop
diff -r cc579ffe4b7e -r 4747dea7f805 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Tue Mar 17 11:11:32 2015 +0530
+++ b/source/test/pixelharness.cpp Tue Mar 17 16:59:40 2015 -0700
@@ -1149,6 +1149,71 @@
return true;
}
+bool PixelHarness::check_findPosLast(findPosLast_t ref, findPosLast_t opt)
+{ // Runs ref and opt on the same random inputs ITERS times; returns false on the first mismatch in return value or per-group outputs
+ ALIGN_VAR_16(coeff_t, ref_src[32 * 32 + ITERS * 2]); // 32*32 random coefficients followed by an ITERS*2 constant tail
+ uint8_t ref_coeffNum[MLS_GRP_NUM], opt_coeffNum[MLS_GRP_NUM]; // value range[0, 16]
+ uint16_t ref_coeffSign[MLS_GRP_NUM], opt_coeffSign[MLS_GRP_NUM]; // bit mask map for non-zero coeff sign
+ uint16_t ref_coeffFlag[MLS_GRP_NUM], opt_coeffFlag[MLS_GRP_NUM]; // bit mask map for non-zero coeff
+ // NOTE(review): totalCoeffs below is accumulated but never read anywhere in this function
+ int totalCoeffs = 0;
+ for (int i = 0; i < 32 * 32; i++)
+ {
+ ref_src[i] = rand() & SHORT_MAX; // NOTE(review): assuming SHORT_MAX is a positive mask, no negative coefficient is ever generated, so the sign path is not exercised
+ totalCoeffs += (ref_src[i] != 0);
+ }
+
+ // extra test area all of 0x1234
+ for (int i = 0; i < ITERS * 2; i++)
+ {
+ ref_src[32 * 32 + i] = 0x1234; // non-zero filler, so windows that slide past the random area still contain non-zero coefficients
+ }
+
+ // poison both output sets once, before the loop; presumably so that a routine which fails to clear its outputs is detected
+ memset(ref_coeffNum, 0xCD, sizeof(ref_coeffNum));
+ memset(ref_coeffSign, 0xCD, sizeof(ref_coeffSign));
+ memset(ref_coeffFlag, 0xCD, sizeof(ref_coeffFlag));
+
+ memset(opt_coeffNum, 0xCD, sizeof(opt_coeffNum));
+ memset(opt_coeffSign, 0xCD, sizeof(opt_coeffSign));
+ memset(opt_coeffFlag, 0xCD, sizeof(opt_coeffFlag));
+ // the buffers are not re-poisoned between iterations; later iterations start from the previous results
+ for (int i = 0; i < ITERS; i++)
+ {
+ int rand_scan_type = rand() % NUM_SCAN_TYPE;
+ int rand_scan_size = rand() % NUM_SCAN_SIZE;
+ int rand_numCoeff = 0;
+ // count the non-zero coefficients in the window of 2^(2 * (rand_scan_size + 2)) entries starting at ref_src + i
+ for (int j = 0; j < 1 << (2 * (rand_scan_size + 2)); j++)
+ rand_numCoeff += (ref_src[i + j] != 0);
+
+ const uint16_t* const scanTbl = g_scanOrder[rand_scan_type][rand_scan_size];
+ // same scan table, coefficient window and count for both; only the output buffers differ
+ int ref_scanPos = ref(scanTbl, ref_src + i, ref_coeffSign, ref_coeffFlag, ref_coeffNum, rand_numCoeff);
+ int opt_scanPos = (int)checked(opt, scanTbl, ref_src + i, opt_coeffSign, opt_coeffFlag, opt_coeffNum, rand_numCoeff); // checked() is defined elsewhere; presumably a call wrapper - confirm
+
+ if (ref_scanPos != opt_scanPos)
+ return false;
+ // compare group by group until the reference counts account for every non-zero coefficient
+ for (int j = 0; rand_numCoeff; j++) // NOTE(review): stops only when the counter reaches exactly 0; j is not bounded by MLS_GRP_NUM
+ {
+ if (ref_coeffSign[j] != opt_coeffSign[j])
+ return false;
+
+ if (ref_coeffFlag[j] != opt_coeffFlag[j])
+ return false;
+
+ if (ref_coeffNum[j] != opt_coeffNum[j])
+ return false;
+
+ rand_numCoeff -= ref_coeffNum[j]; // remove this group's non-zero count from the remaining total
+ }
+
+ reportfail();
+ }
+
+ return true;
+}
bool PixelHarness::testPU(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt)
{
@@ -1653,6 +1718,15 @@
}
}
+ if (opt.findPosLast)
+ {
+ if (!check_findPosLast(ref.findPosLast, opt.findPosLast))
+ {
+ printf("findPosLast failed!\n");
+ return false;
+ }
+ }
+
return true;
}
@@ -2003,4 +2077,13 @@
HEADER0("propagateCost");
REPORT_SPEEDUP(opt.propagateCost, ref.propagateCost, ibuf1, ushort_test_buff[0], int_test_buff[0], ushort_test_buff[0], int_test_buff[0], double_test_buff[0], 80);
}
+
+ if (opt.findPosLast)
+ {
+ HEADER("findPosLast");
+ coeff_t coefBuf[32 * 32];
+ memset(coefBuf, 0, sizeof(coefBuf));
+ memset(coefBuf + 32 * 31, 1, 32 * sizeof(coeff_t));
+ REPORT_SPEEDUP(opt.findPosLast, ref.findPosLast, g_scanOrder[SCAN_DIAG][NUM_SCAN_SIZE - 1], coefBuf, (uint16_t*)sbuf1, (uint16_t*)sbuf2, (uint8_t*)psbuf1, 32);
+ }
}
diff -r cc579ffe4b7e -r 4747dea7f805 source/test/pixelharness.h
--- a/source/test/pixelharness.h Tue Mar 17 11:11:32 2015 +0530
+++ b/source/test/pixelharness.h Tue Mar 17 16:59:40 2015 -0700
@@ -104,6 +104,7 @@
bool check_psyCost_pp(pixelcmp_t ref, pixelcmp_t opt);
bool check_psyCost_ss(pixelcmp_ss_t ref, pixelcmp_ss_t opt);
bool check_calSign(sign_t ref, sign_t opt);
+ bool check_findPosLast(findPosLast_t ref, findPosLast_t opt);
public:
More information about the x265-devel
mailing list