[x265] [PATCH 02 of 13] x86 : AVX2 psyrdoquant primitive for all sizes

vignesh at multicorewareinc.com vignesh at multicorewareinc.com
Fri Jul 6 11:18:02 CEST 2018


# HG changeset patch
# User Jayashree <jayashree.c at multicorewareinc.com>
# Date 1523609472 -19800
#      Fri Apr 13 14:21:12 2018 +0530
# Node ID d55132eb42010fe0cbb95cd245d3d1fff69743c4
# Parent  c9f622347ce51cf90b593e8500ee5a40888c6f29
x86 : AVX2 psyrdoquant primitive for all sizes

diff -r c9f622347ce5 -r d55132eb4201 source/common/cpu.cpp
--- a/source/common/cpu.cpp	Wed Mar 07 09:34:37 2018 +0530
+++ b/source/common/cpu.cpp	Fri Apr 13 14:21:12 2018 +0530
@@ -58,6 +58,7 @@
 #endif // if X265_ARCH_ARM
 
 namespace X265_NS {
+static bool enable512 = false;
 const cpu_name_t cpu_names[] =
 {
 #if X265_ARCH_X86
@@ -122,10 +123,14 @@
 #pragma warning(disable: 4309) // truncation of constant value
 #endif
 
+bool detect512() /* Returns the file-level enable512 flag, which is set where X265_CPU_AVX512 is added to the CPU mask. */
+{
+    return enable512;
+}
 uint32_t cpu_detect(bool benableavx512 )
 {
-    uint32_t cpu = 0;
 
+    uint32_t cpu = 0; 
     uint32_t eax, ebx, ecx, edx;
     uint32_t vendor[4] = { 0 };
     uint32_t max_extended_cap, max_basic_cap;
@@ -189,7 +194,10 @@
                 if ((xcr0 & 0xE0) == 0xE0) /* OPMASK/ZMM state */
                 {
                     if ((ebx & 0xD0030000) == 0xD0030000)
+                    {
                         cpu |= X265_CPU_AVX512;
+                        enable512 = true;
+                    }
                 }
             }
         }
@@ -390,3 +398,4 @@
 
 #endif // if X265_ARCH_X86
 }
+
diff -r c9f622347ce5 -r d55132eb4201 source/common/cpu.h
--- a/source/common/cpu.h	Wed Mar 07 09:34:37 2018 +0530
+++ b/source/common/cpu.h	Fri Apr 13 14:21:12 2018 +0530
@@ -26,7 +26,6 @@
 #define X265_CPU_H
 
 #include "common.h"
-
 /* All assembly functions are prefixed with X265_NS (macro expanded) */
 #define PFX3(prefix, name) prefix ## _ ## name
 #define PFX2(prefix, name) PFX3(prefix, name)
@@ -51,6 +50,7 @@
 
 namespace X265_NS {
 uint32_t cpu_detect(bool);
+bool detect512();
 
 struct cpu_name_t
 {
diff -r c9f622347ce5 -r d55132eb4201 source/common/dct.cpp
--- a/source/common/dct.cpp	Wed Mar 07 09:34:37 2018 +0530
+++ b/source/common/dct.cpp	Fri Apr 13 14:21:12 2018 +0530
@@ -1027,6 +1027,47 @@
         blkPos += trSize;
     }
 }
+template<int log2TrSize>
+static void psyRdoQuant_c_1(int16_t *m_resiDctCoeff, /*int16_t  *m_fencDctCoeff, */ int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, /* int64_t *psyScale,*/ uint32_t blkPos)
+{
+    /* For one MLS_CG_SIZE x MLS_CG_SIZE group starting at blkPos: costUncoded = (coef * coef) << costShift, also added to both totals. */
+    const int fwdShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
+    const int costShift = SCALE_BITS - 2 * fwdShift;
+    const uint32_t rowStride = 1 << log2TrSize; /* index distance between consecutive rows of the group */
+    for (int row = 0; row < MLS_CG_SIZE; row++, blkPos += rowStride)
+    {
+        for (int col = 0; col < MLS_CG_SIZE; col++)
+        {
+            const uint32_t pos = blkPos + col;
+            const int64_t coef = m_resiDctCoeff[pos]; /* pre-quantization DCT coeff */
+            costUncoded[pos] = static_cast<int64_t>(static_cast<double>((coef * coef) << costShift));
+            *totalUncodedCost += costUncoded[pos];
+            *totalRdCost += costUncoded[pos];
+        }
+    }
+}
+template<int log2TrSize>
+static void psyRdoQuant_c_2(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
+{
+    /* Subtracts (psyScale * predictedCoef) >> psyShift from each costUncoded entry of one coefficient group, then adds the updated entry to both totals. */
+    /* NOTE(review): psyRdoQuant_c_1 already adds the squared term to both totals, so it looks counted twice when both passes run - confirm intended. */
+    const int fwdShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
+    const int psyShift = X265_MAX(0, (2 * fwdShift + 1));
+    const uint32_t rowStride = 1 << log2TrSize; /* index distance between consecutive rows of the group */
+    for (int row = 0; row < MLS_CG_SIZE; row++, blkPos += rowStride)
+    {
+        for (int col = 0; col < MLS_CG_SIZE; col++)
+        {
+            const uint32_t pos = blkPos + col;
+            const int64_t resiCoef = m_resiDctCoeff[pos];            /* pre-quantization DCT coeff */
+            const int64_t predCoef = m_fencDctCoeff[pos] - resiCoef; /* predicted DCT = source DCT - residual DCT */
+            costUncoded[pos] -= static_cast<int64_t>(static_cast<double>(((*psyScale) * predCoef) >> psyShift));
+            *totalUncodedCost += costUncoded[pos];
+            *totalRdCost += costUncoded[pos];
+        }
+    }
+}
+
 namespace X265_NS {
 // x265 private namespace
 void setupDCTPrimitives_c(EncoderPrimitives& p)
@@ -1063,7 +1104,14 @@
     p.cu[BLOCK_8x8].copy_cnt   = copy_count<8>;
     p.cu[BLOCK_16x16].copy_cnt = copy_count<16>;
     p.cu[BLOCK_32x32].copy_cnt = copy_count<32>;
-
+	p.cu[BLOCK_4x4].psyRdoQuant_1p = psyRdoQuant_c_1<2>;
+	p.cu[BLOCK_4x4].psyRdoQuant_2p = psyRdoQuant_c_2<2>;
+	p.cu[BLOCK_8x8].psyRdoQuant_1p = psyRdoQuant_c_1<3>;
+	p.cu[BLOCK_8x8].psyRdoQuant_2p = psyRdoQuant_c_2<3>;
+	p.cu[BLOCK_16x16].psyRdoQuant_1p = psyRdoQuant_c_1<4>;
+	p.cu[BLOCK_16x16].psyRdoQuant_2p = psyRdoQuant_c_2<4>;
+	p.cu[BLOCK_32x32].psyRdoQuant_1p = psyRdoQuant_c_1<5>;
+	p.cu[BLOCK_32x32].psyRdoQuant_2p = psyRdoQuant_c_2<5>;
     p.scanPosLast = scanPosLast_c;
     p.findPosFirstLast = findPosFirstLast_c;
     p.costCoeffNxN = costCoeffNxN_c;
diff -r c9f622347ce5 -r d55132eb4201 source/common/primitives.h
--- a/source/common/primitives.h	Wed Mar 07 09:34:37 2018 +0530
+++ b/source/common/primitives.h	Fri Apr 13 14:21:12 2018 +0530
@@ -225,7 +225,8 @@
 typedef void (*integralh_t)(uint32_t *sum, pixel *pix, intptr_t stride);
 typedef void(*nonPsyRdoQuant_t)(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos);
 typedef void(*psyRdoQuant_t)(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos);
-
+typedef void(*psyRdoQuant_t1)(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost,uint32_t blkPos);
+typedef void(*psyRdoQuant_t2)(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos);
 /* Function pointers to optimized encoder primitives. Each pointer can reference
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */
 struct EncoderPrimitives
@@ -300,6 +301,8 @@
         intra_pred_t    intra_pred[NUM_INTRA_MODE];
         nonPsyRdoQuant_t nonPsyRdoQuant;
         psyRdoQuant_t    psyRdoQuant;
+		psyRdoQuant_t1   psyRdoQuant_1p;
+		psyRdoQuant_t2   psyRdoQuant_2p;
     }
     cu[NUM_CU_SIZES];
     /* These remaining primitives work on either fixed block sizes or take
diff -r c9f622347ce5 -r d55132eb4201 source/common/quant.cpp
--- a/source/common/quant.cpp	Wed Mar 07 09:34:37 2018 +0530
+++ b/source/common/quant.cpp	Fri Apr 13 14:21:12 2018 +0530
@@ -723,7 +723,14 @@
             X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n");
             uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
             uint32_t blkPos      = codeParams.scan[scanPosBase];
-            primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
+            bool enable512 = detect512();
+            if (enable512)
+                primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
+            else
+            {
+                primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff,  costUncoded, &totalUncodedCost, &totalRdCost,blkPos);
+                primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
+            }
         }
     }
     else
@@ -798,7 +805,15 @@
             uint32_t blkPos = codeParams.scan[scanPosBase];
             if (usePsyMask)
             {
-                primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
+                bool enable512 = detect512();
+                
+                if (enable512)
+                    primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
+                else
+                {
+                    primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
+                    primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
+                }
                 blkPos = codeParams.scan[scanPosBase];
                 for (int y = 0; y < MLS_CG_SIZE; y++)
                 {
diff -r c9f622347ce5 -r d55132eb4201 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Mar 07 09:34:37 2018 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Apr 13 14:21:12 2018 +0530
@@ -2314,6 +2314,10 @@
         p.cu[BLOCK_8x8].nonPsyRdoQuant = PFX(nonPsyRdoQuant8_avx2);
         p.cu[BLOCK_16x16].nonPsyRdoQuant = PFX(nonPsyRdoQuant16_avx2);
         p.cu[BLOCK_32x32].nonPsyRdoQuant = PFX(nonPsyRdoQuant32_avx2);
+        p.cu[BLOCK_4x4].psyRdoQuant_1p = PFX(psyRdoQuant_1p4_avx2);
+        p.cu[BLOCK_8x8].psyRdoQuant_1p = PFX(psyRdoQuant_1p8_avx2);
+        p.cu[BLOCK_16x16].psyRdoQuant_1p = PFX(psyRdoQuant_1p16_avx2);
+        p.cu[BLOCK_32x32].psyRdoQuant_1p = PFX(psyRdoQuant_1p32_avx2);
 
         /* TODO: This kernel needs to be modified to work with HIGH_BIT_DEPTH only 
         p.planeClipAndMax = PFX(planeClipAndMax_avx2); */
@@ -4697,6 +4701,10 @@
         p.cu[BLOCK_8x8].nonPsyRdoQuant = PFX(nonPsyRdoQuant8_avx2);
         p.cu[BLOCK_16x16].nonPsyRdoQuant = PFX(nonPsyRdoQuant16_avx2);
         p.cu[BLOCK_32x32].nonPsyRdoQuant = PFX(nonPsyRdoQuant32_avx2);
+        p.cu[BLOCK_4x4].psyRdoQuant_1p = PFX(psyRdoQuant_1p4_avx2);
+        p.cu[BLOCK_8x8].psyRdoQuant_1p = PFX(psyRdoQuant_1p8_avx2);
+        p.cu[BLOCK_16x16].psyRdoQuant_1p = PFX(psyRdoQuant_1p16_avx2);
+        p.cu[BLOCK_32x32].psyRdoQuant_1p = PFX(psyRdoQuant_1p32_avx2);
 
     }
     if (cpuMask & X265_CPU_AVX512)
diff -r c9f622347ce5 -r d55132eb4201 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm	Wed Mar 07 09:34:37 2018 +0530
+++ b/source/common/x86/dct8.asm	Fri Apr 13 14:21:12 2018 +0530
@@ -7353,4 +7353,289 @@
     movq           [r2],       xm6
     movq           [r3],       xm7
     RET
+
+INIT_YMM avx2
+cglobal psyRdoQuant_1p4, 5, 9, 16                                ; 4x4 variant; NOTE(review): r0-r4 presumably follow the dct8.h prototype order (resi coeffs, costUncoded, totalUncodedCost, totalRdCost, blkPos) - confirm
+    mov            r4d,        r4m                               ; reload blkPos as a 32-bit value
+    lea             r0,        [r0 + 2 * r4]                     ; r0 += 2 * blkPos bytes (2-byte coefficients)
+    lea             r4,        [4 * r4]                          ; r4 = 4 * blkPos
+    lea             r1,        [r1 + 2 * r4]                     ; r1 += 8 * blkPos bytes (8-byte cost entries)
+    movq           xm0,        [r2]                              ; xm0 = 64-bit total currently stored at [r2]
+    movq           xm1,        [r3]                              ; xm1 = 64-bit total currently stored at [r3]
+
+%if BIT_DEPTH == 12
+    mov            r5,         [tab_nonpsyRdo12]                 ; scaleBits
+%elif BIT_DEPTH == 10
+    mov            r5,         [tab_nonpsyRdo10]
+%elif BIT_DEPTH == 8
+    mov            r5,         [tab_nonpsyRdo8]
+%else
+    %error Unsupported BIT_DEPTH!
 %endif
+    movq           xm2,        r5                                ; shift count for vpsllq
+    vpxor           m4,        m4                                ; m4 = running sum of all costs
+    vpxor           m3,        m3                                ; m3 = 0 (FMA addend, zero source for punpckhqdq)
+    vpxor           m13,       m13
+
+    vpmovsxwd                  m6,        [r0]                   ; row 0: sign-extend words to dwords; NOTE(review): the ymm form loads 16 bytes but only xm6 (four coefficients) is used
+    vcvtdq2pd                  m9,        xm6                    ; four int32 -> four doubles
+    vfmadd213pd                m9,        m9,             m3     ; m9 = m9 * m9 + m3 (zero) = signCoef * signCoef
+    vcvtpd2dq                  xm8,       m9                     ; squares back to int32
+    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
+    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits 
+    paddq                      m4,        m13                    ; accumulate this row's four costs
+    movu                       [r1],       m13                   ; store four int64 costs of row 0
+
+    vpmovsxwd                 m6,        [r0 + 8]                ; row 1: 8 bytes = 4 coefficients further
+    vcvtdq2pd                 m9,        xm6
+    vfmadd213pd               m9,        m9,             m3
+    vcvtpd2dq                 xm8,       m9
+    vpmovsxdq                 m13,       xm8                              ; 32 bit int to 64 bit int
+    vpsllq                    m13,       xm2                             ;(signCoef * signCoef) << scaleBits 
+    paddq                     m4,        m13
+    movu                      [r1 + 32], m13                     ; row 1 costs: 32 bytes = 4 int64 entries further
+
+    vpmovsxwd                 m6,        [r0 + 16]               ; row 2
+    vcvtdq2pd                 m9,        xm6
+    vfmadd213pd               m9,        m9,             m3
+    vcvtpd2dq                 xm8,       m9
+    vpmovsxdq                 m13,       xm8                              ; 32 bit int to 64 bit int
+    vpsllq                    m13,       xm2                             ;(signCoef * signCoef) << scaleBits 
+    paddq                     m4,        m13
+    movu                      [r1 + 64], m13                     ; row 2 costs
+
+    vpmovsxwd                 m6,        [r0 +24]                ; row 3
+    vcvtdq2pd                 m9,        xm6
+    vfmadd213pd               m9,        m9,             m3
+    vcvtpd2dq                 xm8,       m9
+    vpmovsxdq                 m13,       xm8                              ; 32 bit int to 64 bit int 
+    vpsllq                    m13,       xm2                             ;(signCoef * signCoef) << scaleBits
+    paddq                     m4,        m13
+    movu                      [r1 + 96], m13                     ; row 3 costs
+
+
+    vextracti128              xm2,       m4,            1        ; upper 128 bits of the accumulator
+    paddq                     xm4,       xm2                     ; fold upper half into lower half
+    punpckhqdq                xm2,       xm4,            xm3     ; xm2 low qword = high qword of xm4 (xm3 is zero)
+    paddq                     xm4,       xm2                     ; low qword of xm4 = sum of all 16 costs
+
+    paddq                     xm0,       xm4                     ; add the group sum to the total loaded from [r2]
+    paddq                     xm1,       xm4                     ; add the group sum to the total loaded from [r3]
+
+    movq                      [r2],      xm0                     ; write both updated totals back
+    movq                      [r3],      xm1
+    RET
+INIT_YMM avx2
+cglobal psyRdoQuant_1p8, 7, 9, 16                                ; 8x8 variant; NOTE(review): declares 7 arguments while the dct8.h prototype lists five (the 4x4 variant declares 5) - confirm
+    mov            r4d,        r4m                               ; reload blkPos as a 32-bit value
+    lea             r0,        [r0 + 2 * r4]                     ; r0 += 2 * blkPos bytes (2-byte coefficients)
+    lea             r4,        [4 * r4]                          ; r4 = 4 * blkPos
+    lea             r1,        [r1 + 2 * r4]                     ; r1 += 8 * blkPos bytes (8-byte cost entries)
+    movq           xm0,        [r2]                              ; xm0 = 64-bit total currently stored at [r2]
+    movq           xm1,        [r3]                              ; xm1 = 64-bit total currently stored at [r3]
+%if BIT_DEPTH == 12
+    mov            r5,         [tab_nonpsyRdo12 +8]                 ; scaleBits (qword at byte offset 8 of the table)
+%elif BIT_DEPTH == 10
+    mov            r5,         [tab_nonpsyRdo10 +8]
+%elif BIT_DEPTH == 8
+    mov            r5,         [tab_nonpsyRdo8 + 8 ]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+    movq           xm2,        r5                                ; shift count for vpsllq
+    vpxor           m4,        m4                                ; m4 = running sum of all costs
+    vpxor           m3,        m3                                ; m3 = 0 (FMA addend, zero source for punpckhqdq)
+    vpxor           m13,       m13
+
+
+    vpmovsxwd                  m6,        [r0]                   ; row 0: sign-extend words to dwords; only xm6 (four coefficients) is used below
+    vcvtdq2pd                  m9,        xm6                    ; four int32 -> four doubles
+    vfmadd213pd                m9,        m9,             m3     ; m9 = m9 * m9 + m3 (zero) = signCoef * signCoef
+    vcvtpd2dq                  xm8,       m9                     ; squares back to int32
+    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
+    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits 
+    paddq                      m4,        m13                    ; accumulate this row's four costs
+    movu                       [r1],       m13                   ; store four int64 costs of row 0
+
+    vpmovsxwd                  m6,        [r0 + 16]              ; row 1: 16 bytes = 8 coefficients further
+    vcvtdq2pd                  m9,        xm6
+    vfmadd213pd                m9,        m9,             m3
+    vcvtpd2dq                  xm8,       m9
+    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
+    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits 
+    paddq                      m4,        m13
+    movu                       [r1 + 64],       m13              ; row 1 costs: 64 bytes = 8 int64 entries further
+
+    vpmovsxwd                  m6,        [r0 +32]               ; row 2
+    vcvtdq2pd                  m9,        xm6
+    vfmadd213pd                m9,        m9,             m3
+    vcvtpd2dq                  xm8,       m9
+    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
+    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits 
+    paddq                      m4,        m13
+    movu                       [r1 +128],       m13              ; row 2 costs
+
+    vpmovsxwd                  m6,        [r0 + 48]              ; row 3
+    vcvtdq2pd                  m9,        xm6
+    vfmadd213pd                m9,        m9,             m3
+    vcvtpd2dq                  xm8,       m9
+    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
+    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits 
+    paddq                      m4,        m13
+    movu                       [r1 + 192],       m13             ; row 3 costs
+
+    vextracti128              xm2,       m4,            1        ; upper 128 bits of the accumulator
+    paddq                     xm4,       xm2                     ; fold upper half into lower half
+    punpckhqdq                xm2,       xm4,            xm3     ; xm2 low qword = high qword of xm4 (xm3 is zero)
+    paddq                     xm4,       xm2                     ; low qword of xm4 = sum of all 16 costs
+
+    paddq                     xm0,       xm4                     ; add the group sum to the total loaded from [r2]
+    paddq                     xm1,       xm4                     ; add the group sum to the total loaded from [r3]
+
+    movq                      [r2],      xm0                     ; write both updated totals back
+    movq                      [r3],      xm1
+    RET
+
+INIT_YMM avx2
+cglobal psyRdoQuant_1p16, 7, 9, 16                               ; 16x16 variant; NOTE(review): declares 7 arguments while the dct8.h prototype lists five - confirm
+    mov            r4d,        r4m                               ; reload blkPos as a 32-bit value
+    lea             r0,        [r0 + 2 * r4]                     ; r0 += 2 * blkPos bytes (2-byte coefficients)
+    lea             r4,        [4 * r4]                          ; r4 = 4 * blkPos
+    lea             r1,        [r1 + 2 * r4]                     ; r1 += 8 * blkPos bytes (8-byte cost entries)
+    movq           xm0,        [r2]                              ; xm0 = 64-bit total currently stored at [r2]
+    movq           xm1,        [r3]                              ; xm1 = 64-bit total currently stored at [r3]
+%if BIT_DEPTH == 12
+    mov            r5,         [tab_nonpsyRdo12 + 16]                 ; scaleBits (qword at byte offset 16 of the table)
+%elif BIT_DEPTH == 10
+    mov            r5,         [tab_nonpsyRdo10 + 16]
+%elif BIT_DEPTH == 8
+    mov            r5,         [tab_nonpsyRdo8 + 16 ]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+    movq           xm2,        r5                                ; shift count for vpsllq
+    vpxor           m4,        m4                                ; m4 = running sum of all costs
+    vpxor           m3,        m3                                ; m3 = 0 (FMA addend, zero source for punpckhqdq)
+    vpxor           m13,       m13
+
+    vpmovsxwd                  m6,        [r0]                   ; row 0: sign-extend words to dwords; only xm6 (four coefficients) is used below
+    vcvtdq2pd                  m9,        xm6                    ; four int32 -> four doubles
+    vfmadd213pd                m9,        m9,             m3     ; m9 = m9 * m9 + m3 (zero) = signCoef * signCoef
+    vcvtpd2dq                  xm8,       m9                     ; squares back to int32
+    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
+    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits 
+    paddq                      m4,        m13                    ; accumulate this row's four costs
+    movu                       [r1],       m13                   ; store four int64 costs of row 0
+
+    vpmovsxwd                  m6,        [r0 + mmsize]          ; row 1: mmsize bytes further (32 with INIT_YMM = 16 coefficients)
+
+    vcvtdq2pd                  m9,        xm6
+    vfmadd213pd                m9,        m9,             m3
+    vcvtpd2dq                  xm8,       m9
+    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
+    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits 
+    paddq                      m4,        m13
+    movu                       [r1 + 4*mmsize],       m13        ; row 1 costs: 4 * mmsize bytes = 16 int64 entries further
+
+    vpmovsxwd                  m6,        [r0 + 2 * mmsize]      ; row 2
+    vcvtdq2pd                  m9,        xm6
+    vfmadd213pd                m9,        m9,             m3
+    vcvtpd2dq                  xm8,       m9
+    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
+    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits 
+    paddq                      m4,        m13
+    movu                       [r1 + 8*mmsize],       m13        ; row 2 costs
+
+    vpmovsxwd                  m6,        [r0 + 3 * mmsize]      ; row 3
+    vcvtdq2pd                  m9,        xm6
+    vfmadd213pd                m9,        m9,             m3
+    vcvtpd2dq                  xm8,       m9
+    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
+    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits 
+    paddq                      m4,        m13
+    movu                       [r1 + 12*mmsize],       m13       ; row 3 costs
+
+    vextracti128              xm2,       m4,            1        ; upper 128 bits of the accumulator
+    paddq                     xm4,       xm2                     ; fold upper half into lower half
+    punpckhqdq                xm2,       xm4,            xm3     ; xm2 low qword = high qword of xm4 (xm3 is zero)
+    paddq                     xm4,       xm2                     ; low qword of xm4 = sum of all 16 costs
+
+    paddq                     xm0,       xm4                     ; add the group sum to the total loaded from [r2]
+    paddq                     xm1,       xm4                     ; add the group sum to the total loaded from [r3]
+
+    movq                      [r2],      xm0                     ; write both updated totals back
+    movq                      [r3],      xm1
+    RET
+
+INIT_YMM avx2
+cglobal psyRdoQuant_1p32, 7, 9, 16                               ; 32x32 variant; NOTE(review): declares 7 arguments while the dct8.h prototype lists five - confirm
+   mov            r4d,        r4m                                ; reload blkPos as a 32-bit value
+    lea             r0,        [r0 + 2 * r4]                     ; r0 += 2 * blkPos bytes (2-byte coefficients)
+    lea             r4,        [4 * r4]                          ; r4 = 4 * blkPos
+    lea             r1,        [r1 + 2 * r4]                     ; r1 += 8 * blkPos bytes (8-byte cost entries)
+    movq           xm0,        [r2]                              ; xm0 = 64-bit total currently stored at [r2]
+    movq           xm1,        [r3]                              ; xm1 = 64-bit total currently stored at [r3]
+%if BIT_DEPTH == 12
+    mov            r5,         [tab_nonpsyRdo12 + 24]                 ; scaleBits (qword at byte offset 24 of the table)
+%elif BIT_DEPTH == 10
+    mov            r5,         [tab_nonpsyRdo10 + 24]
+%elif BIT_DEPTH == 8
+    mov            r5,         [tab_nonpsyRdo8 + 24]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+    movq           xm2,        r5                                ; shift count for vpsllq
+    vpxor           m4,        m4                                ; m4 = running sum of all costs
+    vpxor           m3,        m3                                ; m3 = 0 (FMA addend, zero source for punpckhqdq)
+    vpxor           m13,       m13
+
+
+    vpmovsxwd                  m6,        [r0]                   ; row 0: sign-extend words to dwords; only xm6 (four coefficients) is used below
+    vcvtdq2pd                  m9,        xm6                    ; four int32 -> four doubles
+    vfmadd213pd                m9,        m9,             m3     ; m9 = m9 * m9 + m3 (zero) = signCoef * signCoef
+    vcvtpd2dq                  xm8,       m9                     ; squares back to int32
+    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
+    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits 
+    paddq                      m4,        m13                    ; accumulate this row's four costs
+    movu                       [r1],       m13                   ; store four int64 costs of row 0
+
+    vpmovsxwd                  m6,        [r0 + 2 * mmsize]      ; row 1: 2 * mmsize bytes further (64 with INIT_YMM = 32 coefficients)
+    vcvtdq2pd                  m9,        xm6
+    vfmadd213pd                m9,        m9,             m3
+    vcvtpd2dq                  xm8,       m9
+    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
+    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits 
+    paddq                      m4,        m13
+    movu                       [r1 + 8 * mmsize],       m13      ; row 1 costs: 8 * mmsize bytes = 32 int64 entries further
+
+    vpmovsxwd                  m6,        [r0 + 4 * mmsize]      ; row 2
+    vcvtdq2pd                  m9,        xm6
+    vfmadd213pd                m9,        m9,             m3
+    vcvtpd2dq                  xm8,       m9
+    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
+    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits 
+    paddq                      m4,        m13
+    movu                       [r1 + 16 * mmsize],       m13     ; row 2 costs
+
+    vpmovsxwd                  m6,        [r0 + 6 * mmsize]      ; row 3
+    vcvtdq2pd                  m9,        xm6
+    vfmadd213pd                m9,        m9,             m3
+    vcvtpd2dq                  xm8,       m9
+    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
+    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits 
+    paddq                      m4,        m13
+    movu                       [r1  + 24 *mmsize],       m13     ; row 3 costs
+
+    vextracti128              xm2,       m4,            1        ; upper 128 bits of the accumulator
+    paddq                     xm4,       xm2                     ; fold upper half into lower half
+    punpckhqdq                xm2,       xm4,            xm3     ; xm2 low qword = high qword of xm4 (xm3 is zero)
+    paddq                     xm4,       xm2                     ; low qword of xm4 = sum of all 16 costs
+
+    paddq                     xm0,       xm4                     ; add the group sum to the total loaded from [r2]
+    paddq                     xm1,       xm4                     ; add the group sum to the total loaded from [r3]
+
+    movq                      [r2],      xm0                     ; write both updated totals back
+    movq                      [r3],      xm1
+    RET
+
+%endif
diff -r c9f622347ce5 -r d55132eb4201 source/common/x86/dct8.h
--- a/source/common/x86/dct8.h	Wed Mar 07 09:34:37 2018 +0530
+++ b/source/common/x86/dct8.h	Fri Apr 13 14:21:12 2018 +0530
@@ -37,6 +37,8 @@
 FUNCDEF_TU_S2(void, nonPsyRdoQuant, avx512, int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos);
 FUNCDEF_TU_S2(void, psyRdoQuant, avx512, int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, int64_t* costUncoded, int64_t* totalUncodedCost, int64_t* totalRdCost, int64_t *psyScale, uint32_t blkPos);
 FUNCDEF_TU_S2(void, nonPsyRdoQuant, avx2, int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos);
+FUNCDEF_TU_S2(void, psyRdoQuant_1p, avx2, int16_t* m_resiDctCoeff,  int64_t* costUncoded, int64_t* totalUncodedCost, int64_t* totalRdCost,  uint32_t blkPos);
+FUNCDEF_TU_S2(void, psyRdoQuant_2p, avx2, int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, int64_t* costUncoded, int64_t* totalUncodedCost, int64_t* totalRdCost, int64_t *psyScale, uint32_t blkPos);
 
 void PFX(dst4_ssse3)(const int16_t* src, int16_t* dst, intptr_t srcStride);
 void PFX(dst4_sse2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
diff -r c9f622347ce5 -r d55132eb4201 source/test/mbdstharness.cpp
--- a/source/test/mbdstharness.cpp	Wed Mar 07 09:34:37 2018 +0530
+++ b/source/test/mbdstharness.cpp	Fri Apr 13 14:21:12 2018 +0530
@@ -370,7 +370,49 @@
 
     return true;
 }
+bool MBDstHarness::check_psyRdoQuant_primitive_avx2(psyRdoQuant_t1 ref, psyRdoQuant_t1 opt)
+{
+    /* Runs ref and opt on the same coefficients, starting totals and blkPos for ITERS rounds;
+     * returns false on any difference in the cost buffers or in either running total. */
 
+    int j = 0;
+    int trSize[4] = { 16, 64, 256, 1024 }; /* candidate values passed to the primitive as blkPos */
+    ALIGN_VAR_32(int64_t, ref_dest[4 * MAX_TU_SIZE]);
+    ALIGN_VAR_32(int64_t, opt_dest[4 * MAX_TU_SIZE]);
+
+    for (int i = 0; i < ITERS; i++)
+    {
+        int64_t totalRdCostRef = rand();
+        int64_t totalUncodedCostRef = rand();
+        int64_t totalRdCostOpt = totalRdCostRef;
+        int64_t totalUncodedCostOpt = totalUncodedCostRef;
+
+        int index = rand() % 4;
+        uint32_t blkPos =  trSize[index];
+        int index1 = rand() % TEST_CASES;
+
+        /* Clear and compare the complete buffers (sizes in bytes) so rows written at any blkPos are verified. */
+        memset(ref_dest, 0, sizeof(ref_dest));
+        memset(opt_dest, 0, sizeof(opt_dest));
+
+        ref(short_test_buff[index1] + j, ref_dest, &totalUncodedCostRef, &totalRdCostRef, blkPos);
+        checked(opt, short_test_buff[index1] + j, opt_dest, &totalUncodedCostOpt, &totalRdCostOpt, blkPos);
+
+        if (memcmp(ref_dest, opt_dest, sizeof(ref_dest)))
+            return false;
+
+        if (totalUncodedCostRef != totalUncodedCostOpt)
+            return false;
+
+        if (totalRdCostRef != totalRdCostOpt)
+            return false;
+
+        reportfail();
+        j += INCR;
+    }
+
+    return true;
+}
 bool MBDstHarness::check_count_nonzero_primitive(count_nonzero_t ref, count_nonzero_t opt)
 {
     int j = 0;
@@ -530,7 +572,17 @@
             }
         }
     }
-
+    for (int i = 0; i < NUM_TR_SIZE; i++)
+    {
+        if (opt.cu[i].psyRdoQuant_1p)
+        {
+            if (!check_psyRdoQuant_primitive_avx2(ref.cu[i].psyRdoQuant_1p, opt.cu[i].psyRdoQuant_1p))
+            {
+                printf("psyRdoQuant_1p[%dx%d]: Failed!\n", 4 << i, 4 << i);
+                return false;
+            }
+        }
+    }
     for (int i = 0; i < NUM_TR_SIZE; i++)
     {
         if (opt.cu[i].count_nonzero)
@@ -643,7 +695,17 @@
             REPORT_SPEEDUP(opt.cu[value].psyRdoQuant, ref.cu[value].psyRdoQuant, short_test_buff[0], short_test_buff1[0], opt_dest, &totalUncodedCost, &totalRdCost, psyScale, 0);
         }
     }
-
+    for (int value = 0; value < NUM_TR_SIZE; value++)
+    {
+        if (opt.cu[value].psyRdoQuant_1p)
+        {
+            ALIGN_VAR_32(int64_t, opt_dest[4 * MAX_TU_SIZE]);
+            int64_t totalRdCost = 0;
+            int64_t totalUncodedCost = 0;
+            printf("psyRdoQuant_1p[%dx%d]", 4 << value, 4 << value);
+            REPORT_SPEEDUP(opt.cu[value].psyRdoQuant_1p, ref.cu[value].psyRdoQuant_1p, short_test_buff[0], opt_dest, &totalUncodedCost, &totalRdCost, 0);
+        }
+    }
     for (int value = 0; value < NUM_TR_SIZE; value++)
     {
         if (opt.cu[value].count_nonzero)
diff -r c9f622347ce5 -r d55132eb4201 source/test/mbdstharness.h
--- a/source/test/mbdstharness.h	Wed Mar 07 09:34:37 2018 +0530
+++ b/source/test/mbdstharness.h	Fri Apr 13 14:21:12 2018 +0530
@@ -71,6 +71,7 @@
     bool check_idct_primitive(idct_t ref, idct_t opt, intptr_t width);
     bool check_count_nonzero_primitive(count_nonzero_t ref, count_nonzero_t opt);
     bool check_denoise_dct_primitive(denoiseDct_t ref, denoiseDct_t opt);
+    bool check_psyRdoQuant_primitive_avx2(psyRdoQuant_t1 ref, psyRdoQuant_t1 opt);
 
 public:
 
-------------- next part --------------
A non-text attachment was scrubbed...
Name: x265-02.patch
Type: text/x-patch
Size: 28044 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20180706/a54d8bb3/attachment-0001.bin>


More information about the x265-devel mailing list