[x265] [PATCH 115 of 307] x86: Aligned routine encoder integration for blockfill_s primitive

mythreyi at multicorewareinc.com
Sat Apr 7 04:31:53 CEST 2018


# HG changeset patch
# User Jayashri Murugan <jayashri at multicorewareinc.com>
# Date 1507182370 -19800
#      Thu Oct 05 11:16:10 2017 +0530
# Node ID d4ee703039c6cde39312a596cee019c346a8381b
# Parent  14c93ddbd598128b43a96ff21221e2dbb189d275
x86: Aligned routine encoder integration for blockfill_s primitive

diff -r 14c93ddbd598 -r d4ee703039c6 source/common/quant.cpp
--- a/source/common/quant.cpp	Wed Oct 04 15:55:03 2017 +0530
+++ b/source/common/quant.cpp	Thu Oct 05 11:16:10 2017 +0530
@@ -188,8 +188,9 @@
     m_nr           = NULL;
 }
 
-bool Quant::init(double psyScale, const ScalingList& scalingList, Entropy& entropy)
+bool Quant::init(double psyScale, const ScalingList& scalingList, Entropy& entropy, int cpuid)
 {
+    m_cpuid = cpuid;
     m_entropyCoder = &entropy;
     m_psyRdoqScale = (int32_t)(psyScale * 256.0);
     X265_CHECK((psyScale * 256.0) < (double)MAX_INT, "psyScale value too large\n");
@@ -611,7 +612,10 @@
             const int add_2nd = 1 << (shift_2nd - 1);
 
             int dc_val = (((m_resiDctCoeff[0] * (64 >> 6) + add_1st) >> shift_1st) * (64 >> 3) + add_2nd) >> shift_2nd;
-            primitives.cu[sizeIdx].blockfill_s(residual, resiStride, (int16_t)dc_val);
+            if ((resiStride % 64 == 0) && (m_cpuid & X265_CPU_AVX512))
+                primitives.cu[sizeIdx].blockfill_s_aligned(residual, resiStride, (int16_t)dc_val);
+            else
+                primitives.cu[sizeIdx].blockfill_s(residual, resiStride, (int16_t)dc_val);
             return;
         }
 
diff -r 14c93ddbd598 -r d4ee703039c6 source/common/quant.h
--- a/source/common/quant.h	Wed Oct 04 15:55:03 2017 +0530
+++ b/source/common/quant.h	Thu Oct 05 11:16:10 2017 +0530
@@ -93,6 +93,7 @@
 
 public:
 
+    int m_cpuid;
     NoiseReduction*    m_nr;
     NoiseReduction*    m_frameNr; // Array of NR structures, one for each frameEncoder
 
@@ -100,7 +101,7 @@
     ~Quant();
 
     /* one-time setup */
-    bool init(double psyScale, const ScalingList& scalingList, Entropy& entropy);
+    bool init(double psyScale, const ScalingList& scalingList, Entropy& entropy, int cpuid);
     bool allocNoiseReduction(const x265_param& param);
 
     /* CU setup */
diff -r 14c93ddbd598 -r d4ee703039c6 source/encoder/search.cpp
--- a/source/encoder/search.cpp	Wed Oct 04 15:55:03 2017 +0530
+++ b/source/encoder/search.cpp	Thu Oct 05 11:16:10 2017 +0530
@@ -81,7 +81,7 @@
     m_rdCost.setSsimRd(param.bSsimRd);
     m_me.init(param.internalCsp);
 
-    bool ok = m_quant.init(param.psyRdoq, scalingList, m_entropyCoder);
+    bool ok = m_quant.init(param.psyRdoq, scalingList, m_entropyCoder, param.cpuid);
     if (m_param->noiseReductionIntra || m_param->noiseReductionInter || m_param->rc.vbvBufferSize)
         ok &= m_quant.allocNoiseReduction(param);
 
@@ -2914,7 +2914,10 @@
         }
         else
         {
-            primitives.cu[sizeIdx].blockfill_s(curResiY, strideResiY, 0);
+            if ((strideResiY % 64 == 0) && (m_param->cpuid & X265_CPU_AVX512))
+                primitives.cu[sizeIdx].blockfill_s_aligned(curResiY, strideResiY, 0);
+            else
+                primitives.cu[sizeIdx].blockfill_s(curResiY, strideResiY, 0);
             cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, depth);
         }
 
@@ -2947,7 +2950,10 @@
                 }
                 else
                 {
-                    primitives.cu[sizeIdxC].blockfill_s(curResiU, strideResiC, 0);
+                    if ((strideResiC % 64 == 0) && (m_param->cpuid & X265_CPU_AVX512))
+                        primitives.cu[sizeIdxC].blockfill_s_aligned(curResiU, strideResiC, 0);
+                    else
+                        primitives.cu[sizeIdxC].blockfill_s(curResiU, strideResiC, 0);
                     cu.setCbfPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
                 }
 
@@ -2961,7 +2967,11 @@
                 }
                 else
                 {
-                    primitives.cu[sizeIdxC].blockfill_s(curResiV, strideResiC, 0);
+                    if ((strideResiC % 64 == 0) && (m_param->cpuid & X265_CPU_AVX512))
+                        primitives.cu[sizeIdxC].blockfill_s_aligned(curResiV, strideResiC, 0);
+                    else
+                        primitives.cu[sizeIdxC].blockfill_s(curResiV, strideResiC, 0);
+
                     cu.setCbfPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
                 }
             }
@@ -3229,7 +3239,10 @@
                 {
                     cbfFlag[TEXT_LUMA][0] = 0;
                     singleBits[TEXT_LUMA][0] = 0;
-                    primitives.cu[partSize].blockfill_s(curResiY, strideResiY, 0);
+                    if ((strideResiY % 64 == 0) && (m_param->cpuid & X265_CPU_AVX512))
+                        primitives.cu[partSize].blockfill_s_aligned(curResiY, strideResiY, 0);
+                    else
+                        primitives.cu[partSize].blockfill_s(curResiY, strideResiY, 0);
 #if CHECKED_BUILD || _DEBUG
                     uint32_t numCoeffY = 1 << (log2TrSize << 1);
                     memset(coeffCurY, 0, sizeof(coeff_t)* numCoeffY);
@@ -3252,7 +3265,10 @@
         {
             if (checkTransformSkipY)
                 minCost[TEXT_LUMA][0] = estimateNullCbfCost(zeroDistY, zeroEnergyY, tuDepth, TEXT_LUMA);
-            primitives.cu[partSize].blockfill_s(curResiY, strideResiY, 0);
+            if ((strideResiY % 64 == 0) && (m_param->cpuid & X265_CPU_AVX512))
+                primitives.cu[partSize].blockfill_s_aligned(curResiY, strideResiY, 0);
+            else
+                primitives.cu[partSize].blockfill_s(curResiY, strideResiY, 0);
             singleDist[TEXT_LUMA][0] = zeroDistY;
             singleBits[TEXT_LUMA][0] = 0;
             singleEnergy[TEXT_LUMA][0] = zeroEnergyY;
@@ -3341,7 +3357,10 @@
                             {
                                 cbfFlag[chromaId][tuIterator.section] = 0;
                                 singleBits[chromaId][tuIterator.section] = 0;
-                                primitives.cu[partSizeC].blockfill_s(curResiC, strideResiC, 0);
+                                if ((strideResiC % 64 == 0) && (m_param->cpuid & X265_CPU_AVX512))
+                                    primitives.cu[partSizeC].blockfill_s_aligned(curResiC, strideResiC, 0);
+                                else
+                                    primitives.cu[partSizeC].blockfill_s(curResiC, strideResiC, 0);
 #if CHECKED_BUILD || _DEBUG
                                 uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
                                 memset(coeffCurC + subTUOffset, 0, sizeof(coeff_t) * numCoeffC);
@@ -3364,7 +3383,10 @@
                     {
                         if (checkTransformSkipC)
                             minCost[chromaId][tuIterator.section] = estimateNullCbfCost(zeroDistC, zeroEnergyC, tuDepthC, (TextType)chromaId);
-                        primitives.cu[partSizeC].blockfill_s(curResiC, strideResiC, 0);
+                        if ((strideResiC % 64 == 0) && (m_param->cpuid & X265_CPU_AVX512))
+                            primitives.cu[partSizeC].blockfill_s_aligned(curResiC, strideResiC, 0);
+                        else
+                            primitives.cu[partSizeC].blockfill_s(curResiC, strideResiC, 0);
                         singleBits[chromaId][tuIterator.section] = 0;
                         singleDist[chromaId][tuIterator.section] = zeroDistC;
                         singleEnergy[chromaId][tuIterator.section] = zeroEnergyC;


More information about the x265-devel mailing list