<div dir="ltr"><div>Below are the performance test results on Haswell with and without limiting rect/amp mode analysis in the slow preset.</div><div><br></div><div><b>Before</b></div><div>D:\ashok>x265_b.exe --input \\HEVC-TEST-2\testsequences\ducks_take_off_1080p50.y4m --preset slow --hash=1 --no-info --psnr --ssim -o test_b.hevc --bitrate 6000</div><div>encoded 500 frames in 157.90s (3.17 fps), 6060.98 kb/s, Avg QP:40.41, Global PSNR: 29.485, SSIM Mean Y: 0.7780801 ( 6.538 dB)</div><div><br></div><div><b>After</b></div><div>D:\ashok>x265.exe --input \\HEVC-TEST-2\testsequences\ducks_take_off_1080p50.y4m --preset slow --hash=1 --no-info --psnr --ssim -o test.hevc --bitrate 6000 --limit-rect-amp 1</div><div>encoded 500 frames in 148.53s (3.37 fps), 6062.77 kb/s, Avg QP:40.43, Global PSNR: 29.487, SSIM Mean Y: 0.7780540 ( 6.538 dB)</div><div><br></div><div>D:\ashok>x265.exe --input \\HEVC-TEST-2\testsequences\ducks_take_off_1080p50.y4m --preset slow --hash=1 --no-info --psnr --ssim -o test.hevc --bitrate 6000 --limit-refs 1</div><div>encoded 500 frames in 136.84s (3.65 fps), 6061.15 kb/s, Avg QP:40.42, Global PSNR: 29.480, SSIM Mean Y: 0.7778692 ( 6.534 dB)</div><div><br></div><div>D:\ashok>x265.exe --input \\HEVC-TEST-2\testsequences\ducks_take_off_1080p50.y4m --preset slow --hash=1 --no-info --psnr --ssim -o test.hevc --bitrate 6000 --limit-refs 1 --limit-rect-amp 1</div><div>encoded 500 frames in 133.06s (3.76 fps), 6062.52 kb/s, Avg QP:40.43, Global PSNR: 29.481, SSIM Mean Y: 0.7779036 ( 6.535 dB)</div></div><div class="gmail_extra"><br><div class="gmail_quote">On Thu, Oct 15, 2015 at 8:31 PM, <span dir="ltr"><<a href="mailto:ashok@multicorewareinc.com" target="_blank">ashok@multicorewareinc.com</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>
# User Ashok Kumar Mishra<<a href="mailto:ashok@multicorewareinc.com">ashok@multicorewareinc.com</a>><br>
# Date 1444897694 -19800<br>
# Thu Oct 15 13:58:14 2015 +0530<br>
# Node ID 65d7c1f5baf5fa619d773fcc2e1361d46f6df7f1<br>
# Parent f3963e7e75b8dcb599250c082357e08fd32191a5<br>
analysis: avoid redundant rect/amp mode analysis based on split block rdCost and mvCost for rd-0/4<br>
<br>
diff -r f3963e7e75b8 -r 65d7c1f5baf5 source/encoder/analysis.cpp<br>
--- a/source/encoder/analysis.cpp Wed Oct 14 17:44:33 2015 +0530<br>
+++ b/source/encoder/analysis.cpp Thu Oct 15 13:58:14 2015 +0530<br>
@@ -809,7 +809,7 @@<br>
return refMask;<br>
}<br>
<br>
-uint32_t Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)<br>
+SplitData Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)<br>
{<br>
uint32_t depth = cuGeom.depth;<br>
uint32_t cuAddr = parentCTU.m_cuAddr;<br>
@@ -823,7 +823,13 @@<br>
uint32_t minDepth = topSkipMinDepth(parentCTU, cuGeom);<br>
bool earlyskip = false;<br>
bool splitIntra = true;<br>
- uint32_t splitRefs[4] = { 0, 0, 0, 0 };<br>
+<br>
+ SplitData splitData[4];<br>
+ splitData[0].initSplitCUData();<br>
+ splitData[1].initSplitCUData();<br>
+ splitData[2].initSplitCUData();<br>
+ splitData[3].initSplitCUData();<br>
+<br>
/* Step 1. Evaluate Merge/Skip candidates for likely early-outs */<br>
if (mightNotSplit && depth >= minDepth)<br>
{<br>
@@ -869,7 +875,7 @@<br>
if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)<br>
nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));<br>
<br>
- splitRefs[subPartIdx] = compressInterCU_rd0_4(parentCTU, childGeom, nextQP);<br>
+ splitData[subPartIdx] = compressInterCU_rd0_4(parentCTU, childGeom, nextQP);<br>
<br>
// Save best CU and pred data for this sub CU<br>
splitIntra |= nd.bestMode->cu.isIntra(0);<br>
@@ -899,7 +905,7 @@<br>
/* Split CUs<br>
* 0 1<br>
* 2 3 */<br>
- uint32_t allSplitRefs = splitRefs[0] | splitRefs[1] | splitRefs[2] | splitRefs[3];<br>
+ uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;<br>
/* Step 3. Evaluate ME (2Nx2N, rect, amp) and intra modes at current depth */<br>
if (mightNotSplit && depth >= minDepth)<br>
{<br>
@@ -917,7 +923,7 @@<br>
{<br>
CUData& cu = md.pred[PRED_2Nx2N].cu;<br>
uint32_t refMask = cu.getBestRefIdx(0);<br>
- allSplitRefs = splitRefs[0] = splitRefs[1] = splitRefs[2] = splitRefs[3] = refMask;<br>
+ allSplitRefs = splitData[0].splitRefs = splitData[1].splitRefs = splitData[2].splitRefs = splitData[3].splitRefs = refMask;<br>
}<br>
<br>
if (m_slice->m_sliceType == B_SLICE)<br>
@@ -929,23 +935,82 @@<br>
Mode *bestInter = &md.pred[PRED_2Nx2N];<br>
if (m_param->bEnableRectInter)<br>
{<br>
- refMasks[0] = splitRefs[0] | splitRefs[2]; /* left */<br>
- refMasks[1] = splitRefs[1] | splitRefs[3]; /* right */<br>
- md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);<br>
- checkInter_rd0_4(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks);<br>
- if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)<br>
- bestInter = &md.pred[PRED_Nx2N];<br>
-<br>
- refMasks[0] = splitRefs[0] | splitRefs[1]; /* top */<br>
- refMasks[1] = splitRefs[2] | splitRefs[3]; /* bot */<br>
- md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);<br>
- checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);<br>
- if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)<br>
- bestInter = &md.pred[PRED_2NxN];<br>
+ uint64_t splitCost = splitData[0].rdCost + splitData[1].rdCost + splitData[2].rdCost + splitData[3].rdCost;<br>
+ ModeDepth& md = m_modeDepth[depth];<br>
+ uint32_t threshold_2NxN, threshold_Nx2N;<br>
+<br>
+ if (m_slice->m_sliceType == P_SLICE)<br>
+ {<br>
+ threshold_2NxN = splitData[0].mvCost[0] + splitData[1].mvCost[0];<br>
+ threshold_Nx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];<br>
+ }<br>
+ else<br>
+ {<br>
+ threshold_2NxN = (splitData[0].mvCost[0] + splitData[1].mvCost[0]<br>
+ + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;<br>
+ threshold_Nx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]<br>
+ + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;<br>
+ }<br>
+<br>
+ int try_2NxN_first = threshold_2NxN < threshold_Nx2N;<br>
+ if (try_2NxN_first && splitCost < md.bestMode->rdCost + threshold_2NxN)<br>
+ {<br>
+ refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */<br>
+ refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */<br>
+ md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);<br>
+ checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);<br>
+ if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)<br>
+ bestInter = &md.pred[PRED_2NxN];<br>
+ }<br>
+<br>
+ if (splitCost < md.bestMode->rdCost + threshold_Nx2N)<br>
+ {<br>
+ refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* left */<br>
+ refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* right */<br>
+ md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);<br>
+ checkInter_rd0_4(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks);<br>
+ if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)<br>
+ bestInter = &md.pred[PRED_Nx2N];<br>
+ }<br>
+<br>
+ if (!try_2NxN_first && splitCost < md.bestMode->rdCost + threshold_2NxN)<br>
+ {<br>
+ refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */<br>
+ refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */<br>
+ md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);<br>
+ checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);<br>
+ if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)<br>
+ bestInter = &md.pred[PRED_2NxN];<br>
+ }<br>
}<br>
<br>
if (m_slice->m_sps->maxAMPDepth > depth)<br>
{<br>
+ uint64_t splitCost = splitData[0].rdCost + splitData[1].rdCost + splitData[2].rdCost + splitData[3].rdCost;<br>
+ ModeDepth& md = m_modeDepth[depth];<br>
+ uint32_t threshold_2NxnU, threshold_2NxnD, threshold_nLx2N, threshold_nRx2N;<br>
+<br>
+ if (m_slice->m_sliceType == P_SLICE)<br>
+ {<br>
+ threshold_2NxnU = splitData[0].mvCost[0] + splitData[1].mvCost[0];<br>
+ threshold_2NxnD = splitData[2].mvCost[0] + splitData[3].mvCost[0];<br>
+<br>
+ threshold_nLx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];<br>
+ threshold_nRx2N = splitData[1].mvCost[0] + splitData[3].mvCost[0];<br>
+ }<br>
+ else<br>
+ {<br>
+ threshold_2NxnU = (splitData[0].mvCost[0] + splitData[1].mvCost[0]<br>
+ + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;<br>
+ threshold_2NxnD = (splitData[2].mvCost[0] + splitData[3].mvCost[0]<br>
+ + splitData[2].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;<br>
+<br>
+ threshold_nLx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]<br>
+ + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;<br>
+ threshold_nRx2N = (splitData[1].mvCost[0] + splitData[3].mvCost[0]<br>
+ + splitData[1].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;<br>
+ }<br>
+<br>
bool bHor = false, bVer = false;<br>
if (bestInter->cu.m_partSize[0] == SIZE_2NxN)<br>
bHor = true;<br>
@@ -960,35 +1025,69 @@<br>
<br>
if (bHor)<br>
{<br>
- refMasks[0] = splitRefs[0] | splitRefs[1]; /* 25% top */<br>
- refMasks[1] = allSplitRefs; /* 75% bot */<br>
- md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp);<br>
- checkInter_rd0_4(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, refMasks);<br>
- if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)<br>
- bestInter = &md.pred[PRED_2NxnU];<br>
-<br>
- refMasks[0] = allSplitRefs; /* 75% top */<br>
- refMasks[1] = splitRefs[2] | splitRefs[3]; /* 25% bot */<br>
- md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);<br>
- checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);<br>
- if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)<br>
- bestInter = &md.pred[PRED_2NxnD];<br>
+ int try_2NxnD_first = threshold_2NxnD < threshold_2NxnU;<br>
+ if (try_2NxnD_first && splitCost < md.bestMode->rdCost + threshold_2NxnD)<br>
+ {<br>
+ refMasks[0] = allSplitRefs; /* 75% top */<br>
+ refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */<br>
+ md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);<br>
+ checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);<br>
+ if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)<br>
+ bestInter = &md.pred[PRED_2NxnD];<br>
+ }<br>
+<br>
+ if (splitCost < md.bestMode->rdCost + threshold_2NxnU)<br>
+ {<br>
+ refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* 25% top */<br>
+ refMasks[1] = allSplitRefs; /* 75% bot */<br>
+ md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp);<br>
+ checkInter_rd0_4(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, refMasks);<br>
+ if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)<br>
+ bestInter = &md.pred[PRED_2NxnU];<br>
+ }<br>
+<br>
+ if (!try_2NxnD_first && splitCost < md.bestMode->rdCost + threshold_2NxnD)<br>
+ {<br>
+ refMasks[0] = allSplitRefs; /* 75% top */<br>
+ refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */<br>
+ md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);<br>
+ checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);<br>
+ if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)<br>
+ bestInter = &md.pred[PRED_2NxnD];<br>
+ }<br>
}<br>
if (bVer)<br>
{<br>
- refMasks[0] = splitRefs[0] | splitRefs[2]; /* 25% left */<br>
- refMasks[1] = allSplitRefs; /* 75% right */<br>
- md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp);<br>
- checkInter_rd0_4(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, refMasks);<br>
- if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)<br>
- bestInter = &md.pred[PRED_nLx2N];<br>
-<br>
- refMasks[0] = allSplitRefs; /* 75% left */<br>
- refMasks[1] = splitRefs[1] | splitRefs[3]; /* 25% right */<br>
- md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);<br>
- checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);<br>
- if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)<br>
- bestInter = &md.pred[PRED_nRx2N];<br>
+ int try_nRx2N_first = threshold_nRx2N < threshold_nLx2N;<br>
+ if (try_nRx2N_first && splitCost < md.bestMode->rdCost + threshold_nRx2N)<br>
+ {<br>
+ refMasks[0] = allSplitRefs; /* 75% left */<br>
+ refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */<br>
+ md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);<br>
+ checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);<br>
+ if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)<br>
+ bestInter = &md.pred[PRED_nRx2N];<br>
+ }<br>
+<br>
+ if (splitCost < md.bestMode->rdCost + threshold_nLx2N)<br>
+ {<br>
+ refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* 25% left */<br>
+ refMasks[1] = allSplitRefs; /* 75% right */<br>
+ md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp);<br>
+ checkInter_rd0_4(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, refMasks);<br>
+ if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)<br>
+ bestInter = &md.pred[PRED_nLx2N];<br>
+ }<br>
+<br>
+ if (!try_nRx2N_first && splitCost < md.bestMode->rdCost + threshold_nRx2N)<br>
+ {<br>
+ refMasks[0] = allSplitRefs; /* 75% left */<br>
+ refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */<br>
+ md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);<br>
+ checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);<br>
+ if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)<br>
+ bestInter = &md.pred[PRED_nRx2N];<br>
+ }<br>
}<br>
}<br>
bool bTryIntra = m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames;<br>
@@ -1139,19 +1238,32 @@<br>
}<br>
<br>
/* determine which motion references the parent CU should search */<br>
- uint32_t refMask;<br>
- if (!(m_param->limitReferences & X265_REF_LIMIT_DEPTH))<br>
- refMask = 0;<br>
- else if (md.bestMode == &md.pred[PRED_SPLIT])<br>
- refMask = allSplitRefs;<br>
- else<br>
- {<br>
- /* use best merge/inter mode, in case of intra use 2Nx2N inter references */<br>
- CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;<br>
- uint32_t numPU = cu.getNumPartInter(0);<br>
- refMask = 0;<br>
- for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))<br>
- refMask |= cu.getBestRefIdx(subPartIdx);<br>
+ SplitData splitCUData;<br>
+ if (!(m_param->limitReferences & X265_REF_LIMIT_DEPTH))<br>
+ splitCUData.splitRefs = 0;<br>
+ else if (md.bestMode == &md.pred[PRED_SPLIT])<br>
+ splitCUData.splitRefs = allSplitRefs;<br>
+ else<br>
+ {<br>
+ /* use best merge/inter mode, in case of intra use 2Nx2N inter references */<br>
+ CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;<br>
+ uint32_t numPU = cu.getNumPartInter(0);<br>
+ splitCUData.splitRefs = 0;<br>
+ for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))<br>
+ splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx);<br>
+ }<br>
+<br>
+ if (!m_param->limitRectAmp)<br>
+ {<br>
+ splitCUData.mvCost[0] = 0; // L0<br>
+ splitCUData.mvCost[1] = 0; // L1<br>
+ splitCUData.rdCost = 0;<br>
+ }<br>
+ else<br>
+ {<br>
+ splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0<br>
+ splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1<br>
+ splitCUData.rdCost = md.bestMode->rdCost;<br>
}<br>
<br>
if (mightNotSplit)<br>
@@ -1169,7 +1281,7 @@<br>
if (m_param->rdLevel)<br>
md.bestMode->reconYuv.copyToPicYuv(reconPic, cuAddr, cuGeom.absPartIdx);<br>
<br>
- return refMask;<br>
+ return splitCUData;<br>
}<br>
<br>
SplitData Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder, int32_t qp)<br>
diff -r f3963e7e75b8 -r 65d7c1f5baf5 source/encoder/analysis.h<br>
--- a/source/encoder/analysis.h Wed Oct 14 17:44:33 2015 +0530<br>
+++ b/source/encoder/analysis.h Thu Oct 15 13:58:14 2015 +0530<br>
@@ -131,7 +131,7 @@<br>
<br>
/* full analysis for a P or B slice CU */<br>
uint32_t compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);<br>
- uint32_t compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);<br>
+ SplitData compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);<br>
SplitData compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder, int32_t qp);<br>
<br>
/* measure merge and skip */<br>
</blockquote></div><br></div>