[x265] [PATCH 277 of 307] x86: psyRdoQuant primitive

Sat Apr 7 04:34:35 CEST 2018

# HG changeset patch
# User Vignesh Vijayakumar<vignesh at multicorewareinc.com>
# Date 1514521347 -19800
#      Fri Dec 29 09:52:27 2017 +0530
# Node ID 4e9f2efdfd097910aa5bf704a4bbf38b0a28f2a5
# Parent  80775bda5ec16735e7b1de97dedeb7f7ed391c8f
x86: psyRdoQuant primitive

This patch also adds AVX512 assembly code for this primitive
AVX512 :231.20c
C code :1060.74c

diff -r 80775bda5ec1 -r 4e9f2efdfd09 source/common/dct.cpp

--- a/source/common/dct.cpp	Tue Jan 02 15:21:08 2018 +0530
+++ b/source/common/dct.cpp	Fri Dec 29 09:52:27 2017 +0530
@@ -1001,9 +1001,34 @@
         blkPos += trSize;
     }
 }
+template<int log2TrSize>
+static void psyRdoQuant_c(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
+{
+    /* Uncoded cost of one MLS_CG_SIZE x MLS_CG_SIZE coefficient group: scaled squared residual minus the psy term. */
+    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
+    const int scaleBits = SCALE_BITS - 2 * transformShift;
+    const uint32_t trSize = 1 << log2TrSize;
+    const int psyShift = X265_MAX(0, (2 * transformShift + 1));
+    for (int row = 0; row < MLS_CG_SIZE; row++, blkPos += trSize)
+    {
+        for (int col = 0; col < MLS_CG_SIZE; col++)
+        {
+            const uint32_t pos = blkPos + col;
+            const int64_t resiCoef = m_resiDctCoeff[pos];            /* pre-quantization DCT coeff */
+            const int64_t predCoef = m_fencDctCoeff[pos] - resiCoef; /* predicted DCT = source DCT - residual DCT */
+
+            /* distortion when the coefficient is left uncoded */
+            costUncoded[pos] = static_cast<int64_t>((double)((resiCoef * resiCoef) << scaleBits));
+            /* when no residual coefficient is coded, predicted coef == recon coef */
+            costUncoded[pos] -= static_cast<int64_t>((double)(((*psyScale) * predCoef) >> psyShift));
+
+            *totalUncodedCost += costUncoded[pos];
+            *totalRdCost += costUncoded[pos];
+        }
+    }
+}
 namespace X265_NS {
 // x265 private namespace
-
 void setupDCTPrimitives_c(EncoderPrimitives& p)
 {
     p.dequant_scaling = dequant_scaling_c;
@@ -1014,6 +1039,10 @@
     p.cu[BLOCK_8x8].nonPsyRdoQuant   = nonPsyRdoQuant_c<3>;
     p.cu[BLOCK_16x16].nonPsyRdoQuant = nonPsyRdoQuant_c<4>;
     p.cu[BLOCK_32x32].nonPsyRdoQuant = nonPsyRdoQuant_c<5>;
+    p.cu[BLOCK_4x4].psyRdoQuant = psyRdoQuant_c<2>;
+    p.cu[BLOCK_8x8].psyRdoQuant = psyRdoQuant_c<3>;
+    p.cu[BLOCK_16x16].psyRdoQuant = psyRdoQuant_c<4>;
+    p.cu[BLOCK_32x32].psyRdoQuant = psyRdoQuant_c<5>;
     p.dst4x4 = dst4_c;
     p.cu[BLOCK_4x4].dct   = dct4_c;
     p.cu[BLOCK_8x8].dct   = dct8_c;
diff -r 80775bda5ec1 -r 4e9f2efdfd09 source/common/primitives.h
--- a/source/common/primitives.h	Tue Jan 02 15:21:08 2018 +0530
+++ b/source/common/primitives.h	Fri Dec 29 09:52:27 2017 +0530
@@ -224,6 +224,8 @@
 typedef void (*integralv_t)(uint32_t *sum, intptr_t stride);
 typedef void (*integralh_t)(uint32_t *sum, pixel *pix, intptr_t stride);
 typedef void(*nonPsyRdoQuant_t)(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos);
+typedef void(*psyRdoQuant_t)(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos);
+
 /* Function pointers to optimized encoder primitives. Each pointer can reference
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */
 struct EncoderPrimitives
@@ -297,6 +299,7 @@
         intra_filter_t  intra_filter;
         intra_pred_t    intra_pred[NUM_INTRA_MODE];
         nonPsyRdoQuant_t nonPsyRdoQuant;
+        psyRdoQuant_t    psyRdoQuant;
     }
     cu[NUM_CU_SIZES];
     /* These remaining primitives work on either fixed block sizes or take
diff -r 80775bda5ec1 -r 4e9f2efdfd09 source/common/quant.cpp
--- a/source/common/quant.cpp	Tue Jan 02 15:21:08 2018 +0530
+++ b/source/common/quant.cpp	Fri Dec 29 09:52:27 2017 +0530
@@ -642,11 +642,9 @@
     X265_CHECK((int)numSig == primitives.cu[log2TrSize - 2].count_nonzero(dstCoeff), "numSig differ\n");
     if (!numSig)
         return 0;
-
     const uint32_t trSize = 1 << log2TrSize;
     int64_t lambda2 = m_qpParam[ttype].lambda2;
-    const int64_t psyScale = ((int64_t)m_psyRdoqScale * m_qpParam[ttype].lambda);
-
+    int64_t psyScale = ((int64_t)m_psyRdoqScale * m_qpParam[ttype].lambda);
     /* unquant constants for measuring distortion. Scaling list quant coefficients have a (1 << 4)
      * scale applied that must be removed during unquant. Note that in real dequant there is clipping
      * at several stages. We skip the clipping for simplicity when measuring RD cost */
@@ -723,25 +721,9 @@
         for (int cgScanPos = cgLastScanPos + 1; cgScanPos < (int)cgNum ; cgScanPos++)
         {
             X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n");
-
             uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
             uint32_t blkPos      = codeParams.scan[scanPosBase];
-
-            // TODO: we can't SIMD optimize because PSYVALUE need 64-bits multiplication, convert to Double can work faster by FMA
-            for (int y = 0; y < MLS_CG_SIZE; y++)
-            {
-                for (int x = 0; x < MLS_CG_SIZE; x++)
-                {
-                    int signCoef         = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
-                    int predictedCoef    = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
-                    costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
-                    /* when no residual coefficient is coded, predicted coef == recon coef */
-                    costUncoded[blkPos + x] -= PSYVALUE(predictedCoef);
-                    totalUncodedCost += costUncoded[blkPos + x];
-                    totalRdCost += costUncoded[blkPos + x];
-                }
-                blkPos += trSize;
-            }
+            primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
         }
     }
     else
@@ -814,22 +796,14 @@
             // TODO: does we need zero-coeff cost?
             const uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
             uint32_t blkPos = codeParams.scan[scanPosBase];
-
             if (usePsyMask)
             {
-                // TODO: we can't SIMD optimize because PSYVALUE need 64-bits multiplication, convert to Double can work faster by FMA
+                primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
+                blkPos = codeParams.scan[scanPosBase];
                 for (int y = 0; y < MLS_CG_SIZE; y++)
                 {
                     for (int x = 0; x < MLS_CG_SIZE; x++)
                     {
-                        int signCoef         = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
-                        int predictedCoef    = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
-                        costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
-                        /* when no residual coefficient is coded, predicted coef == recon coef */
-                        costUncoded[blkPos + x] -= PSYVALUE(predictedCoef);
-                        totalUncodedCost += costUncoded[blkPos + x];
-                        totalRdCost += costUncoded[blkPos + x];
-
                         const uint32_t scanPosOffset =  y * MLS_CG_SIZE + x;
                         const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset;
                         X265_CHECK(trSize > 4, "trSize check failure\n");
diff -r 80775bda5ec1 -r 4e9f2efdfd09 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Jan 02 15:21:08 2018 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Dec 29 09:52:27 2017 +0530
@@ -3120,7 +3120,10 @@
         p.cu[BLOCK_8x8].nonPsyRdoQuant = PFX(nonPsyRdoQuant8_avx512);
         p.cu[BLOCK_16x16].nonPsyRdoQuant = PFX(nonPsyRdoQuant16_avx512);
         p.cu[BLOCK_32x32].nonPsyRdoQuant = PFX(nonPsyRdoQuant32_avx512);
-
+        p.cu[BLOCK_4x4].psyRdoQuant = PFX(psyRdoQuant4_avx512);
+        p.cu[BLOCK_8x8].psyRdoQuant = PFX(psyRdoQuant8_avx512);
+        p.cu[BLOCK_16x16].psyRdoQuant = PFX(psyRdoQuant16_avx512);
+        p.cu[BLOCK_32x32].psyRdoQuant = PFX(psyRdoQuant32_avx512);
     }
 #endif
 }
@@ -5302,10 +5305,16 @@
         p.pu[LUMA_16x32].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x32>;
         p.pu[LUMA_16x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x64>;
         p.pu[LUMA_48x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_48x64>;
+
         p.cu[BLOCK_4x4].nonPsyRdoQuant = PFX(nonPsyRdoQuant4_avx512);
         p.cu[BLOCK_8x8].nonPsyRdoQuant = PFX(nonPsyRdoQuant8_avx512);
         p.cu[BLOCK_16x16].nonPsyRdoQuant = PFX(nonPsyRdoQuant16_avx512);
         p.cu[BLOCK_32x32].nonPsyRdoQuant = PFX(nonPsyRdoQuant32_avx512);
+        p.cu[BLOCK_4x4].psyRdoQuant = PFX(psyRdoQuant4_avx512);
+        p.cu[BLOCK_8x8].psyRdoQuant = PFX(psyRdoQuant8_avx512);
+        p.cu[BLOCK_16x16].psyRdoQuant = PFX(psyRdoQuant16_avx512);
+        p.cu[BLOCK_32x32].psyRdoQuant = PFX(psyRdoQuant32_avx512);
+
     }
 #endif
 }
diff -r 80775bda5ec1 -r 4e9f2efdfd09 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm	Tue Jan 02 15:21:08 2018 +0530
+++ b/source/common/x86/dct8.asm	Fri Dec 29 09:52:27 2017 +0530
@@ -516,6 +516,7 @@
 tab_nonpsyRdo8 : dq 5, 7, 9, 11
 tab_nonpsyRdo10: dq 9, 11, 13, 15
 tab_nonpsyRdo12: dq 13, 15, 17, 19
+
 SECTION .text
 cextern pd_1
 cextern pd_2
@@ -542,6 +543,10 @@
     %define     DST4_ROUND          16
     %define     DCT8_SHIFT1         6
     %define     DCT8_ROUND1         32
+    %define     RDO_MAX_4           3
+    %define     RDO_MAX_8           1
+    %define     RDO_MAX_16          0
+    %define     RDO_MAX_32          0
 %elif BIT_DEPTH == 10
     %define     DCT4_SHIFT          3
     %define     DCT4_ROUND          4
@@ -551,6 +556,10 @@
     %define     DST4_ROUND          4
     %define     DCT8_SHIFT1         4
     %define     DCT8_ROUND1         8
+    %define     RDO_MAX_4           7
+    %define     RDO_MAX_8           5
+    %define     RDO_MAX_16          3
+    %define     RDO_MAX_32          1
 %elif BIT_DEPTH == 8
     %define     DCT4_SHIFT          1
     %define     DCT4_ROUND          1
@@ -560,6 +569,10 @@
     %define     DST4_ROUND          1
     %define     DCT8_SHIFT1         2
     %define     DCT8_ROUND1         2
+    %define     RDO_MAX_4           11
+    %define     RDO_MAX_8           9
+    %define     RDO_MAX_16          7
+    %define     RDO_MAX_32          5
 %else
     %error Unsupported BIT_DEPTH!
 %endif
@@ -6650,5 +6663,391 @@
     movq           [r2],       xm6
     movq           [r3],       xm7
     RET
-
+;static void psyRdoQuant_c(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
+;{
+;    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
+;    const int scaleBits = SCALE_BITS - 2 * transformShift;
+;    const uint32_t trSize = 1 << log2TrSize;
+;    int max = X265_MAX(0, (2 * transformShift + 1));
+;
+;    for (int y = 0; y < MLS_CG_SIZE; y++)
+;    {
+;        for (int x = 0; x < MLS_CG_SIZE; x++)
+;        {
+;            int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
+;            int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
+;
+;            costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
+;
+;            /* when no residual coefficient is coded, predicted coef == recon coef */
+;            costUncoded[blkPos + x] -= static_cast<int64_t>((double)(((*psyScale) * predictedCoef) >> max));
+;
+;            *totalUncodedCost += costUncoded[blkPos + x];
+;            *totalRdCost += costUncoded[blkPos + x];
+;        }
+;        blkPos += trSize;
+;    }
+;}
+
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
+; void psyRdoQuant(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
+INIT_ZMM avx512                                                  ; psy-RDOQ uncoded cost of one 4x4 coefficient group, 8 coefficients per zmm
+cglobal psyRdoQuant4, 5, 9, 13                                   ; NOTE(review): only r0-r6 and m0-m12 are referenced below - confirm the 9-GPR request
+    mov             r5,        r5m                               ; r5 = psyScale pointer (6th argument)
+    mov            r6d,        r6m                               ; r6 = blkPos (7th argument)
+    vpbroadcastq   m12,        [r5]                              ; psyScale
+    lea             r0,        [r0 + 2 * r6]                     ; r0 = &m_resiDctCoeff[blkPos] (2 bytes per coefficient)
+    lea             r1,        [r1 + 2 * r6]                     ; r1 = &m_fencDctCoeff[blkPos]
+    lea             r6,        [4 * r6]                          ; r6 = 4 * blkPos
+    lea             r2,        [r2 + 2 * r6]                     ; r2 = &costUncoded[blkPos] (8 bytes per entry)
+    movq           xm0,        [r3]                              ; xm0 = *totalUncodedCost
+    movq           xm1,        [r4]                              ; xm1 = *totalRdCost
+
+%if BIT_DEPTH == 12
+    mov            r5,         [tab_nonpsyRdo12]                 ; scaleBits
+%elif BIT_DEPTH == 10
+    mov            r5,         [tab_nonpsyRdo10]                 ; scaleBits (first table entry is used by this 4x4 variant)
+%elif BIT_DEPTH == 8
+    mov            r5,         [tab_nonpsyRdo8]                  ; scaleBits
+%else
+    %error Unsupported BIT_DEPTH!
 %endif
+
+    movq           xm2,        r5                                ; xm2 = shift count consumed by vpsllq
+    vpxor           m4,        m4                                ; m4 = per-lane sum of the costs produced below
+    vpxor           m3,        m3                                ; m3 = 0: FMA addend, and zero source for punpckhqdq
+
+;Row 1, 2
+    vpmovsxwq       m6,        [r0]                              ; 8 x int16 -> 8 x int64: signCoef
+    vpmovsxwq       m7,        [r1]                              ; 8 x int16 -> 8 x int64: source DCT coefficients
+    psubq           m7,        m6                              ; predictedCoef
+
+    vcvtqq2pd       m9,        m6                                ; signCoef as double
+    vfmadd213pd     m9,        m9,             m3                ; m9 = m9 * m9 + 0
+    vcvtpd2qq       m8,        m9                                ; back to int64
+    vpsllq          m8,        xm2                             ;(signCoef * signCoef) << scaleBits
+
+    vcvtqq2pd      m10,        m7                                ; predictedCoef as double
+    vcvtqq2pd      m11,        m12                               ; psyScale as double
+    vfmadd213pd    m10,        m11,            m3                ; m10 = m11 * m10 + 0
+    vcvtpd2qq       m9,        m10                               ; back to int64
+    vpsraq          m9,        RDO_MAX_4                       ;(psyScale * predictedCoef) >> max
+
+    psubq           m8,        m9                                ; costUncoded for these 8 coefficients
+    paddq           m4,        m8                                ; accumulate for the totals
+    movu           [r2],       m8                                ; store 8 consecutive costUncoded entries
+
+    ;Row 3, 4
+    vpmovsxwq       m6,        [r0 + 16]                         ; next 8 coefficients (16 bytes further)
+    vpmovsxwq       m7,        [r1 + 16]
+    psubq           m7,        m6                              ; predictedCoef
+
+    vcvtqq2pd       m9,        m6
+    vfmadd213pd     m9,        m9,             m3                ; m9 = m9 * m9 + 0
+    vcvtpd2qq       m8,        m9
+    vpsllq          m8,        xm2                             ;(signCoef * signCoef) << scaleBits
+
+    vcvtqq2pd      m10,        m7
+    vcvtqq2pd      m11,        m12                               ; psyScale as double (same value as for rows 1, 2)
+    vfmadd213pd    m10,        m11,             m3               ; m10 = m11 * m10 + 0
+    vcvtpd2qq       m9,        m10
+    vpsraq          m9,        RDO_MAX_4                      ;(psyScale * predictedCoef) >> max
+
+    psubq           m8,         m9                               ; costUncoded for these 8 coefficients
+    paddq           m4,         m8
+    movu           [r2 + 64],   m8                               ; 64 bytes = 8 entries past the first store
+
+    vextracti32x8  ym2,         m4,            1                 ; horizontal add: upper 256 bits ...
+    paddq          ym4,        ym2                               ; ... folded onto the lower 256 bits
+    vextracti32x4  xm2,         m4,            1                 ; second 128-bit lane ...
+    paddq          xm4,        xm2                               ; ... folded onto the first
+    punpckhqdq     xm2,        xm4,            xm3               ; bring the high qword down (m3 supplies zero)
+    paddq          xm4,        xm2                               ; low qword of xm4 = sum of all 16 costs
+
+    paddq          xm0,        xm4                               ; totalUncodedCost += sum
+    paddq          xm1,        xm4                               ; totalRdCost += sum
+
+    movq           [r3],       xm0                               ; write *totalUncodedCost back
+    movq           [r4],       xm1                               ; write *totalRdCost back
+    RET
+
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
+; void psyRdoQuant(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
+INIT_ZMM avx512                                                  ; psy-RDOQ uncoded cost of one 4x4 coefficient group inside an 8x8 block
+cglobal psyRdoQuant8, 5, 9, 15                                   ; NOTE(review): only r0-r6 and m0-m14 are referenced below - confirm the 9-GPR request
+    mov             r5,        r5m                               ; r5 = psyScale pointer (6th argument)
+    mov            r6d,        r6m                               ; r6 = blkPos (7th argument)
+    vpbroadcastq   m12,        [r5]                              ; psyScale
+    lea             r0,        [r0 + 2 * r6]                     ; r0 = &m_resiDctCoeff[blkPos] (2 bytes per coefficient)
+    lea             r1,        [r1 + 2 * r6]                     ; r1 = &m_fencDctCoeff[blkPos]
+    lea             r6,        [4 * r6]                          ; r6 = 4 * blkPos
+    lea             r2,        [r2 + 2 * r6]                     ; r2 = &costUncoded[blkPos] (8 bytes per entry)
+    movq           xm0,        [r3]                              ; xm0 = *totalUncodedCost
+    movq           xm1,        [r4]                              ; xm1 = *totalRdCost
+
+%if BIT_DEPTH == 12
+    mov            r5,         [tab_nonpsyRdo12 + 8]                 ; scaleBits
+%elif BIT_DEPTH == 10
+    mov            r5,         [tab_nonpsyRdo10 + 8]                 ; scaleBits (second table entry is used by this 8x8 variant)
+%elif BIT_DEPTH == 8
+    mov            r5,         [tab_nonpsyRdo8 + 8]                  ; scaleBits
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+    movq           xm2,        r5                                ; xm2 = shift count consumed by vpsllq
+    vpxor           m4,        m4                                ; m4 = per-lane sum of the costs produced below
+    vpxor           m3,        m3                                ; m3 = 0: FMA addend, and zero source for punpckhqdq
+
+;Row 1, 2
+    movq           xm13,       [r0]                              ; 4 residual coefficients of the first row
+    movq           xm14,       [r1]                              ; 4 source coefficients of the first row
+    pinsrq         xm13,       [r0 + mmsize/4], 1                ; second row, mmsize/4 bytes further (mmsize is 64 under INIT_ZMM)
+    pinsrq         xm14,       [r1 + mmsize/4], 1
+    vpmovsxwq       m6,        xm13                              ; 8 x int16 -> 8 x int64: signCoef
+    vpmovsxwq       m7,        xm14
+    psubq           m7,        m6                              ; predictedCoef
+
+    vcvtqq2pd       m9,        m6                                ; signCoef as double
+    vfmadd213pd     m9,        m9,             m3                ; m9 = m9 * m9 + 0
+    vcvtpd2qq       m8,        m9                                ; back to int64
+    vpsllq          m8,        xm2                             ;(signCoef * signCoef) << scaleBits
+
+    vcvtqq2pd      m10,        m7                                ; predictedCoef as double
+    vcvtqq2pd      m11,        m12                               ; psyScale as double
+    vfmadd213pd    m10,        m11,            m3                ; m10 = m11 * m10 + 0
+    vcvtpd2qq       m9,        m10                               ; back to int64
+    vpsraq          m9,        RDO_MAX_8                       ;(psyScale * predictedCoef) >> max
+
+    psubq           m8,        m9                                ; costUncoded for these 8 coefficients
+    paddq           m4,        m8                                ; accumulate for the totals
+    movu           [r2],       ym8                               ; first row: 4 costUncoded entries
+    vextracti32x8  [r2 + mmsize],  m8 ,        1                 ; second row, mmsize bytes further in costUncoded
+
+    ;Row 3, 4
+    movq           xm13,       [r0 + mmsize/2]                   ; third row
+    movq           xm14,       [r1 + mmsize/2]
+    pinsrq         xm13,       [r0 + 3 * mmsize/4],      1       ; fourth row
+    pinsrq         xm14,       [r1 + 3 * mmsize/4],      1
+    vpmovsxwq       m6,        xm13
+    vpmovsxwq       m7,        xm14
+    psubq           m7,        m6                              ; predictedCoef
+
+    vcvtqq2pd       m9,        m6
+    vfmadd213pd     m9,        m9,             m3                ; m9 = m9 * m9 + 0
+    vcvtpd2qq       m8,        m9
+    vpsllq          m8,        xm2                             ;(signCoef * signCoef) << scaleBits
+
+    vcvtqq2pd      m10,        m7
+    vcvtqq2pd      m11,        m12                               ; psyScale as double (same value as for rows 1, 2)
+    vfmadd213pd    m10,        m11,             m3               ; m10 = m11 * m10 + 0
+    vcvtpd2qq       m9,        m10
+    vpsraq          m9,        RDO_MAX_8                      ;(psyScale * predictedCoef) >> max
+
+    psubq           m8,         m9                               ; costUncoded for these 8 coefficients
+    paddq           m4,         m8
+    movu           [r2 + 2 * mmsize],       ym8                  ; third row of costUncoded
+    vextracti32x8  [r2 + 3 * mmsize],  m8 ,    1                 ; fourth row of costUncoded
+
+    vextracti32x8  ym2,         m4,            1                 ; horizontal add: upper 256 bits ...
+    paddq          ym4,        ym2                               ; ... folded onto the lower 256 bits
+    vextracti32x4  xm2,         m4,            1                 ; second 128-bit lane ...
+    paddq          xm4,        xm2                               ; ... folded onto the first
+    punpckhqdq     xm2,        xm4,            xm3               ; bring the high qword down (m3 supplies zero)
+    paddq          xm4,        xm2                               ; low qword of xm4 = sum of all 16 costs
+
+    paddq          xm0,        xm4                               ; totalUncodedCost += sum
+    paddq          xm1,        xm4                               ; totalRdCost += sum
+
+    movq           [r3],       xm0                               ; write *totalUncodedCost back
+    movq           [r4],       xm1                               ; write *totalRdCost back
+    RET
+
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
+; void psyRdoQuant(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
+INIT_ZMM avx512                                                  ; psy-RDOQ uncoded cost of one 4x4 coefficient group inside a 16x16 block
+cglobal psyRdoQuant16, 5, 9, 15                                  ; NOTE(review): only r0-r6 and m0-m14 are referenced below - confirm the 9-GPR request
+    mov             r5,        r5m                               ; r5 = psyScale pointer (6th argument)
+    mov            r6d,        r6m                               ; r6 = blkPos (7th argument)
+    vpbroadcastq   m12,        [r5]                              ; psyScale
+    lea             r0,        [r0 + 2 * r6]                     ; r0 = &m_resiDctCoeff[blkPos] (2 bytes per coefficient)
+    lea             r1,        [r1 + 2 * r6]                     ; r1 = &m_fencDctCoeff[blkPos]
+    lea             r6,        [4 * r6]                          ; r6 = 4 * blkPos
+    lea             r2,        [r2 + 2 * r6]                     ; r2 = &costUncoded[blkPos] (8 bytes per entry)
+    movq           xm0,        [r3]                              ; xm0 = *totalUncodedCost
+    movq           xm1,        [r4]                              ; xm1 = *totalRdCost
+
+%if BIT_DEPTH == 12
+    mov            r5,         [tab_nonpsyRdo12 + 16]                 ; scaleBits
+%elif BIT_DEPTH == 10
+    mov            r5,         [tab_nonpsyRdo10 + 16]                 ; scaleBits (third table entry is used by this 16x16 variant)
+%elif BIT_DEPTH == 8
+    mov            r5,         [tab_nonpsyRdo8 + 16]                  ; scaleBits
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+    movq           xm2,        r5                                ; xm2 = shift count consumed by vpsllq
+    vpxor           m4,        m4                                ; m4 = per-lane sum of the costs produced below
+    vpxor           m3,        m3                                ; m3 = 0: FMA addend, and zero source for punpckhqdq
+
+;Row 1, 2
+    movq           xm13,       [r0]                              ; 4 residual coefficients of the first row
+    movq           xm14,       [r1]                              ; 4 source coefficients of the first row
+    pinsrq         xm13,       [r0 + mmsize/2], 1                ; second row, mmsize/2 bytes further (mmsize is 64 under INIT_ZMM)
+    pinsrq         xm14,       [r1 + mmsize/2], 1
+    vpmovsxwq       m6,        xm13                              ; 8 x int16 -> 8 x int64: signCoef
+    vpmovsxwq       m7,        xm14
+    psubq           m7,        m6                              ; predictedCoef
+
+    vcvtqq2pd       m9,        m6                                ; signCoef as double
+    vfmadd213pd     m9,        m9,             m3                ; m9 = m9 * m9 + 0
+    vcvtpd2qq       m8,        m9                                ; back to int64
+    vpsllq          m8,        xm2                             ;(signCoef * signCoef) << scaleBits
+
+    vcvtqq2pd      m10,        m7                                ; predictedCoef as double
+    vcvtqq2pd      m11,        m12                               ; psyScale as double
+    vfmadd213pd    m10,        m11,            m3                ; m10 = m11 * m10 + 0
+    vcvtpd2qq       m9,        m10                               ; back to int64
+    vpsraq          m9,        RDO_MAX_16                      ;(psyScale * predictedCoef) >> max
+
+    psubq           m8,        m9                                ; costUncoded for these 8 coefficients
+    paddq           m4,        m8                                ; accumulate for the totals
+    movu           [r2],       ym8                               ; first row: 4 costUncoded entries
+    vextracti32x8  [r2 + 2 * mmsize],  m8 ,        1             ; second row, 2 * mmsize bytes further in costUncoded
+
+    ;Row 3, 4
+    movq           xm13,       [r0 + mmsize]                     ; third row
+    movq           xm14,       [r1 + mmsize]
+    pinsrq         xm13,       [r0 + 3 * mmsize/2],      1       ; fourth row
+    pinsrq         xm14,       [r1 + 3 * mmsize/2],      1
+    vpmovsxwq       m6,        xm13
+    vpmovsxwq       m7,        xm14
+    psubq           m7,        m6                              ; predictedCoef
+
+    vcvtqq2pd       m9,        m6
+    vfmadd213pd     m9,        m9,             m3                ; m9 = m9 * m9 + 0
+    vcvtpd2qq       m8,        m9
+    vpsllq          m8,        xm2                             ;(signCoef * signCoef) << scaleBits
+
+    vcvtqq2pd      m10,        m7
+    vcvtqq2pd      m11,        m12                               ; psyScale as double (same value as for rows 1, 2)
+    vfmadd213pd    m10,        m11,             m3               ; m10 = m11 * m10 + 0
+    vcvtpd2qq       m9,        m10
+    vpsraq          m9,        RDO_MAX_16                      ;(psyScale * predictedCoef) >> max
+
+    psubq           m8,         m9                               ; costUncoded for these 8 coefficients
+    paddq           m4,         m8
+    movu           [r2 + 4 * mmsize],       ym8                  ; third row of costUncoded
+    vextracti32x8  [r2 + 6 * mmsize],  m8 ,    1                 ; fourth row of costUncoded
+
+    vextracti32x8  ym2,         m4,            1                 ; horizontal add: upper 256 bits ...
+    paddq          ym4,        ym2                               ; ... folded onto the lower 256 bits
+    vextracti32x4  xm2,         m4,            1                 ; second 128-bit lane ...
+    paddq          xm4,        xm2                               ; ... folded onto the first
+    punpckhqdq     xm2,        xm4,            xm3               ; bring the high qword down (m3 supplies zero)
+    paddq          xm4,        xm2                               ; low qword of xm4 = sum of all 16 costs
+
+    paddq          xm0,        xm4                               ; totalUncodedCost += sum
+    paddq          xm1,        xm4                               ; totalRdCost += sum
+
+    movq           [r3],       xm0                               ; write *totalUncodedCost back
+    movq           [r4],       xm1                               ; write *totalRdCost back
+    RET
+
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
+; void psyRdoQuant(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
+INIT_ZMM avx512                                                  ; psy-RDOQ uncoded cost of one 4x4 coefficient group inside a 32x32 block
+cglobal psyRdoQuant32, 5, 9, 15                                  ; NOTE(review): only r0-r6 and m0-m14 are referenced below - confirm the 9-GPR request
+    mov             r5,        r5m                               ; r5 = psyScale pointer (6th argument)
+    mov            r6d,        r6m                               ; r6 = blkPos (7th argument)
+    vpbroadcastq   m12,        [r5]                              ; psyScale
+    lea             r0,        [r0 + 2 * r6]                     ; r0 = &m_resiDctCoeff[blkPos] (2 bytes per coefficient)
+    lea             r1,        [r1 + 2 * r6]                     ; r1 = &m_fencDctCoeff[blkPos]
+    lea             r6,        [4 * r6]                          ; r6 = 4 * blkPos
+    lea             r2,        [r2 + 2 * r6]                     ; r2 = &costUncoded[blkPos] (8 bytes per entry)
+    movq           xm0,        [r3]                              ; xm0 = *totalUncodedCost
+    movq           xm1,        [r4]                              ; xm1 = *totalRdCost
+
+%if BIT_DEPTH == 12
+    mov            r5,         [tab_nonpsyRdo12 + 24]                 ; scaleBits
+%elif BIT_DEPTH == 10
+    mov            r5,         [tab_nonpsyRdo10 + 24]                 ; scaleBits (fourth table entry is used by this 32x32 variant)
+%elif BIT_DEPTH == 8
+    mov            r5,         [tab_nonpsyRdo8 + 24]                  ; scaleBits
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+    movq           xm2,        r5                                ; xm2 = shift count consumed by vpsllq
+    vpxor           m4,        m4                                ; m4 = per-lane sum of the costs produced below
+    vpxor           m3,        m3                                ; m3 = 0: FMA addend, and zero source for punpckhqdq
+
+;Row 1, 2
+    movq           xm13,       [r0]                              ; 4 residual coefficients of the first row
+    movq           xm14,       [r1]                              ; 4 source coefficients of the first row
+    pinsrq         xm13,       [r0 + mmsize], 1                  ; second row, mmsize bytes further (mmsize is 64 under INIT_ZMM)
+    pinsrq         xm14,       [r1 + mmsize], 1
+    vpmovsxwq       m6,        xm13                              ; 8 x int16 -> 8 x int64: signCoef
+    vpmovsxwq       m7,        xm14
+    psubq           m7,        m6                              ; predictedCoef
+
+    vcvtqq2pd       m9,        m6                                ; signCoef as double
+    vfmadd213pd     m9,        m9,             m3                ; m9 = m9 * m9 + 0
+    vcvtpd2qq       m8,        m9                                ; back to int64
+    vpsllq          m8,        xm2                             ;(signCoef * signCoef) << scaleBits
+
+    vcvtqq2pd      m10,        m7                                ; predictedCoef as double
+    vcvtqq2pd      m11,        m12                               ; psyScale as double
+    vfmadd213pd    m10,        m11,            m3                ; m10 = m11 * m10 + 0
+    vcvtpd2qq       m9,        m10                               ; back to int64
+    vpsraq          m9,        RDO_MAX_32                      ;(psyScale * predictedCoef) >> max
+
+    psubq           m8,        m9                                ; costUncoded for these 8 coefficients
+    paddq           m4,        m8                                ; accumulate for the totals
+    movu           [r2],       ym8                               ; first row: 4 costUncoded entries
+    vextracti32x8  [r2 + 4 * mmsize],  m8 ,        1             ; second row, 4 * mmsize bytes further in costUncoded
+
+    ;Row 3, 4
+    movq           xm13,       [r0 + 2 * mmsize]                 ; third row
+    movq           xm14,       [r1 + 2 * mmsize]
+    pinsrq         xm13,       [r0 + 3 * mmsize],      1         ; fourth row
+    pinsrq         xm14,       [r1 + 3 * mmsize],      1
+    vpmovsxwq       m6,        xm13
+    vpmovsxwq       m7,        xm14
+    psubq           m7,        m6                              ; predictedCoef
+
+    vcvtqq2pd       m9,        m6
+    vfmadd213pd     m9,        m9,             m3                ; m9 = m9 * m9 + 0
+    vcvtpd2qq       m8,        m9
+    vpsllq          m8,        xm2                             ;(signCoef * signCoef) << scaleBits
+
+    vcvtqq2pd      m10,        m7
+    vcvtqq2pd      m11,        m12                               ; psyScale as double (same value as for rows 1, 2)
+    vfmadd213pd    m10,        m11,             m3               ; m10 = m11 * m10 + 0
+    vcvtpd2qq       m9,        m10
+    vpsraq          m9,        RDO_MAX_32                      ;(psyScale * predictedCoef) >> max
+
+    psubq           m8,         m9                               ; costUncoded for these 8 coefficients
+    paddq           m4,         m8
+    movu           [r2 + 8 * mmsize],       ym8                  ; third row of costUncoded
+    vextracti32x8  [r2 + 12 * mmsize], m8 ,    1                 ; fourth row of costUncoded
+
+    vextracti32x8  ym2,         m4,            1                 ; horizontal add: upper 256 bits ...
+    paddq          ym4,        ym2                               ; ... folded onto the lower 256 bits
+    vextracti32x4  xm2,         m4,            1                 ; second 128-bit lane ...
+    paddq          xm4,        xm2                               ; ... folded onto the first
+    punpckhqdq     xm2,        xm4,            xm3               ; bring the high qword down (m3 supplies zero)
+    paddq          xm4,        xm2                               ; low qword of xm4 = sum of all 16 costs
+
+    paddq          xm0,        xm4                               ; totalUncodedCost += sum
+    paddq          xm1,        xm4                               ; totalRdCost += sum
+
+    movq           [r3],       xm0                               ; write *totalUncodedCost back
+    movq           [r4],       xm1                               ; write *totalRdCost back
+    RET
+%endif
diff -r 80775bda5ec1 -r 4e9f2efdfd09 source/common/x86/dct8.h
--- a/source/common/x86/dct8.h	Tue Jan 02 15:21:08 2018 +0530
+++ b/source/common/x86/dct8.h	Fri Dec 29 09:52:27 2017 +0530
@@ -35,6 +35,8 @@
 FUNCDEF_TU_S2(void, idct, sse4, const int16_t* src, int16_t* dst, intptr_t dstStride);
 FUNCDEF_TU_S2(void, idct, avx2, const int16_t* src, int16_t* dst, intptr_t dstStride);
 FUNCDEF_TU_S2(void, nonPsyRdoQuant, avx512, int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos);
+FUNCDEF_TU_S2(void, psyRdoQuant, avx512, int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, int64_t* costUncoded, int64_t* totalUncodedCost, int64_t* totalRdCost, int64_t *psyScale, uint32_t blkPos);
+
 void PFX(dst4_ssse3)(const int16_t* src, int16_t* dst, intptr_t srcStride);
 void PFX(dst4_sse2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
 void PFX(idst4_sse2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
diff -r 80775bda5ec1 -r 4e9f2efdfd09 source/test/mbdstharness.cpp
--- a/source/test/mbdstharness.cpp	Tue Jan 02 15:21:08 2018 +0530
+++ b/source/test/mbdstharness.cpp	Fri Dec 29 09:52:27 2017 +0530
@@ -61,16 +61,17 @@
     for (int i = 0; i < TEST_BUF_SIZE; i++)
     {
         short_test_buff[0][i]    = (rand() & PIXEL_MAX) - (rand() & PIXEL_MAX);
+        short_test_buff1[0][i]   = (rand() & PIXEL_MAX) - (rand() & PIXEL_MAX);
         int_test_buff[0][i]      = rand() % PIXEL_MAX;
         int_idct_test_buff[0][i] = (rand() % (SHORT_MAX - SHORT_MIN)) - SHORT_MAX;
         short_denoise_test_buff1[0][i] = short_denoise_test_buff2[0][i] = (rand() & SHORT_MAX) - (rand() & SHORT_MAX);
-
         short_test_buff[1][i]    = -PIXEL_MAX;
+        short_test_buff1[1][i]   = -PIXEL_MAX;
         int_test_buff[1][i]      = -PIXEL_MAX;
         int_idct_test_buff[1][i] = SHORT_MIN;
         short_denoise_test_buff1[1][i] = short_denoise_test_buff2[1][i] = -SHORT_MAX;
-
         short_test_buff[2][i]    = PIXEL_MAX;
+        short_test_buff1[2][i]   = PIXEL_MAX;
         int_test_buff[2][i]      = PIXEL_MAX;
         int_idct_test_buff[2][i] = SHORT_MAX;
         short_denoise_test_buff1[2][i] = short_denoise_test_buff2[2][i] = SHORT_MAX;
@@ -324,6 +325,51 @@
 
     return true;
 }
+bool MBDstHarness::check_psyRdoQuant_primitive(psyRdoQuant_t ref, psyRdoQuant_t opt)
+{
+    /* Run ref and opt on identical random inputs; every output they produce must match. */
+    int j = 0;
+    int trSize[4] = { 16, 64, 256, 1024 };
+    ALIGN_VAR_32(int64_t, ref_dest[4 * MAX_TU_SIZE]);
+    ALIGN_VAR_32(int64_t, opt_dest[4 * MAX_TU_SIZE]);
+    const size_t destBytes = sizeof(ref_dest); /* length in bytes, not in elements */
+
+    for (int i = 0; i < ITERS; i++)
+    {
+        int64_t totalRdCostRef = rand();
+        int64_t totalUncodedCostRef = rand();
+        int64_t totalRdCostOpt = totalRdCostRef;
+        int64_t totalUncodedCostOpt = totalUncodedCostRef;
+        int64_t psyScale = rand(); /* only read through the pointer, so a local is enough */
+
+        int index = rand() % 4;
+        uint32_t blkPos = trSize[index];
+
+        /* clear both buffers completely so entries no primitive touches compare equal */
+        memset(ref_dest, 0, destBytes);
+        memset(opt_dest, 0, destBytes);
+
+        int index1 = rand() % TEST_CASES;
+
+        ref(short_test_buff[index1] + j, short_test_buff1[index1] + j, ref_dest, &totalUncodedCostRef, &totalRdCostRef, &psyScale, blkPos);
+        checked(opt, short_test_buff[index1] + j, short_test_buff1[index1] + j, opt_dest, &totalUncodedCostOpt, &totalRdCostOpt, &psyScale, blkPos);
+
+        /* compare the whole buffers: a wrong or stray write anywhere is a failure */
+        if (memcmp(ref_dest, opt_dest, destBytes))
+            return false;
+
+        if (totalUncodedCostRef != totalUncodedCostOpt)
+            return false;
+
+        if (totalRdCostRef != totalRdCostOpt)
+            return false;
+
+        reportfail();
+        j += INCR;
+    }
+
+    return true;
+}
 
 bool MBDstHarness::check_count_nonzero_primitive(count_nonzero_t ref, count_nonzero_t opt)
 {
@@ -473,6 +519,17 @@
             }
         }
     }
+    for (int i = 0; i < NUM_TR_SIZE; i++)
+    {
+        if (opt.cu[i].psyRdoQuant)
+        {
+            if (!check_psyRdoQuant_primitive(ref.cu[i].psyRdoQuant, opt.cu[i].psyRdoQuant))
+            {
+                printf("psyRdoQuant[%dx%d]: Failed!\n", 4 << i, 4 << i);
+                return false;
+            }
+        }
+    }
 
     for (int i = 0; i < NUM_TR_SIZE; i++)
     {
@@ -573,6 +630,19 @@
             REPORT_SPEEDUP(opt.cu[value].nonPsyRdoQuant, ref.cu[value].nonPsyRdoQuant, short_test_buff[0], opt_dest, &totalUncodedCost, &totalRdCost, 0);
         }
     }
+    for (int value = 0; value < NUM_TR_SIZE; value++)
+    {
+        if (opt.cu[value].psyRdoQuant)
+        {
+            ALIGN_VAR_32(int64_t, opt_dest[4 * MAX_TU_SIZE]);
+            int64_t totalRdCost = 0;
+            int64_t totalUncodedCost = 0;
+            /* stack storage: the primitive only reads *psyScale, and a heap block here was never freed */
+            int64_t psyScale = 0;
+            printf("psyRdoQuant[%dx%d]", 4 << value, 4 << value);
+            REPORT_SPEEDUP(opt.cu[value].psyRdoQuant, ref.cu[value].psyRdoQuant, short_test_buff[0], short_test_buff1[0], opt_dest, &totalUncodedCost, &totalRdCost, &psyScale, 0);
+        }
+    }
 
     for (int value = 0; value < NUM_TR_SIZE; value++)
     {
diff -r 80775bda5ec1 -r 4e9f2efdfd09 source/test/mbdstharness.h
--- a/source/test/mbdstharness.h	Tue Jan 02 15:21:08 2018 +0530
+++ b/source/test/mbdstharness.h	Fri Dec 29 09:52:27 2017 +0530
@@ -51,11 +51,10 @@
     int     mintbuf2[MAX_TU_SIZE];
     int     mintbuf3[MAX_TU_SIZE];
     int     mintbuf4[MAX_TU_SIZE];
-
     int16_t short_test_buff[TEST_CASES][TEST_BUF_SIZE];
+    int16_t short_test_buff1[TEST_CASES][TEST_BUF_SIZE];
     int     int_test_buff[TEST_CASES][TEST_BUF_SIZE];
     int     int_idct_test_buff[TEST_CASES][TEST_BUF_SIZE];
-
     uint32_t mubuf1[MAX_TU_SIZE];
     uint32_t mubuf2[MAX_TU_SIZE];
     uint16_t mushortbuf1[MAX_TU_SIZE];
@@ -65,6 +64,7 @@
     bool check_dequant_primitive(dequant_scaling_t ref, dequant_scaling_t opt);
     bool check_dequant_primitive(dequant_normal_t ref, dequant_normal_t opt);
     bool check_nonPsyRdoQuant_primitive(nonPsyRdoQuant_t ref, nonPsyRdoQuant_t opt);
+    bool check_psyRdoQuant_primitive(psyRdoQuant_t ref, psyRdoQuant_t opt);
     bool check_quant_primitive(quant_t ref, quant_t opt);
     bool check_nquant_primitive(nquant_t ref, nquant_t opt);
     bool check_dct_primitive(dct_t ref, dct_t opt, intptr_t width);