[x265] [PATCH 117 of 307] x86: Aligned routine encoder integration for calcresidual primitive

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:31:55 CEST 2018


# HG changeset patch
# User Jayashri Murugan <jayashri at multicorewareinc.com>
# Date 1507182997 -19800
#      Thu Oct 05 11:26:37 2017 +0530
# Node ID 1748c9a5c9b16c380f926cd5d07a69c4f13a6fab
# Parent  c497cbf5c2d53ea9c47f3929eaacbb36e703bdfa
x86: Aligned routine encoder integration for calcresidual primitive

diff -r c497cbf5c2d5 -r 1748c9a5c9b1 source/encoder/search.cpp
--- a/source/encoder/search.cpp	Wed Oct 04 16:33:33 2017 +0530
+++ b/source/encoder/search.cpp	Thu Oct 05 11:26:37 2017 +0530
@@ -354,8 +354,10 @@
         // store original entropy coding status
         if (bEnableRDOQ)
             m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
-
-        primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride);
+        if ((stride % 64 == 0) && (m_param->cpuid & X265_CPU_AVX512))
+            primitives.cu[sizeIdx].calcresidual_aligned(fenc, pred, residual, stride);
+        else
+            primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride);
 
         uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
         if (numSig)
@@ -561,7 +563,10 @@
         pixel*   tmpRecon = (useTSkip ? m_tsRecon : reconQt);
         uint32_t tmpReconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);
 
-        primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride);
+        if ((stride % 64 == 0) && (m_param->cpuid & X265_CPU_AVX512))
+            primitives.cu[sizeIdx].calcresidual_aligned(fenc, pred, residual, stride);
+        else
+            primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride);
 
         uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, useTSkip);
         if (numSig)
@@ -714,7 +719,10 @@
         coeff_t* coeffY       = cu.m_trCoeff[0] + coeffOffsetY;
 
         uint32_t sizeIdx   = log2TrSize - 2;
-        primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride);
+        if ((stride % 64 == 0) && (m_param->cpuid & X265_CPU_AVX512))
+            primitives.cu[sizeIdx].calcresidual_aligned(fenc, pred, residual, stride);
+        else
+            primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride);
 
         PicYuv*  reconPic = m_frame->m_reconPic;
         pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
@@ -893,7 +901,11 @@
             predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC);
             cu.setTransformSkipPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
 
-            primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride);
+            if ((stride % 64 == 0) && (m_param->cpuid & X265_CPU_AVX512))
+                primitives.cu[sizeIdxC].calcresidual_aligned(fenc, pred, residual, stride);
+            else
+                primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride);
+
             uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
             if (numSig)
             {
@@ -992,7 +1004,10 @@
                 pixel*   recon = (useTSkip ? m_tsRecon : reconQt);
                 uint32_t reconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);
 
-                primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride);
+                if ((stride % 64 == 0) && (m_param->cpuid & X265_CPU_AVX512))
+                    primitives.cu[sizeIdxC].calcresidual_aligned(fenc, pred, residual, stride);
+                else
+                    primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride);
 
                 uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, useTSkip);
                 if (numSig)
@@ -1183,7 +1198,11 @@
 
             X265_CHECK(!cu.m_transformSkip[ttype][0], "transform skip not supported at low RD levels\n");
 
-            primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride);
+            if ((stride % 64 == 0) && (m_param->cpuid & X265_CPU_AVX512))
+                primitives.cu[sizeIdxC].calcresidual_aligned(fenc, pred, residual, stride);
+            else
+                primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride);
+
             uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
             if (numSig)
             {


More information about the x265-devel mailing list