[x265] [PATCH x265] Add AVX2 assembly code for ssimDistortion primitive.

Akil akil at multicorewareinc.com
Tue Mar 5 05:33:27 CET 2019


# HG changeset patch
# User Akil Ayyappan<akil at multicorewareinc.com>
# Date 1551251102 -19800
#      Wed Feb 27 12:35:02 2019 +0530
# Node ID d12a4caf7963fd47d646040689ad5f02754ad879
# Parent  cb3e172a5f51c6a4bf8adb7953fe53277f5a1979
x86: ssimDistortion primitive

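This patch moves the SSIM distortion loops out of quant.cpp into a new ssimDist primitive (C reference in pixel.cpp), adds AVX2 assembly for the 4x4 to 64x64 block sizes, and adds correctness and speed checks to the pixel test harness.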

|---------|-----------|-----------------|-----------------|
|  Size   |Performance|AVX2 clock cycles|CPP clock cycles |
|---------|-----------|-----------------|-----------------|
| [4x4]   |   3.52x   |     264.43      |     932.05      |
| [8x8]   |   5.11x   |     619.24      |     3163.56     |
| [16x16] |   5.44x   |     2114.00     |     11490.52    |
| [32x32] |   6.01x   |     7589.70     |     45608.01    |
| [64x64] |   6.70x   |     27859.21    |     186634.25   |
|---------|-----------|-----------------|-----------------|

diff -r cb3e172a5f51 -r d12a4caf7963 source/common/pixel.cpp
--- a/source/common/pixel.cpp Tue Feb 19 20:20:35 2019 +0530
+++ b/source/common/pixel.cpp Wed Feb 27 12:35:02 2019 +0530
@@ -934,6 +934,31 @@
     }
 }

+// C reference for ssimDist: over one (1 << log2TrSize)-square block, stores the sum of squared
+// (fenc - recon) differences in *ssBlock and the sum of squared (fenc >> shift) values in *ac_k.
+template<int log2TrSize>
+static void ssimDist_c(const pixel* fenc, uint32_t fStride, const pixel* recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k)
+{
+    // Signed block size: comparing the int loop counters against an unsigned bound
+    // triggers sign-compare warnings.
+    const int trSize = 1 << log2TrSize;
+    uint64_t ssSum = 0, acSum = 0;
+
+    // Both sums read the same fenc pixels, so gather them in a single pass.
+    for (int y = 0; y < trSize; y++)
+    {
+        for (int x = 0; x < trSize; x++)
+        {
+            int diff = fenc[y * fStride + x] - recon[y * rstride + x]; // residual sample
+            uint32_t src = fenc[y * fStride + x] >> shift;
+            ssSum += diff * diff;
+            acSum += src * src;
+        }
+    }
+    *ssBlock = ssSum;
+    *ac_k = acSum;
+}
+
 #if HIGH_BIT_DEPTH
 static pixel planeClipAndMax_c(pixel *src, intptr_t stride, int width, int
height, uint64_t *outsum,
                                const pixel minPix, const pixel maxPix)
@@ -1283,5 +1308,11 @@
     p.propagateCost = estimateCUPropagateCost;
     p.fix8Unpack = cuTreeFix8Unpack;
     p.fix8Pack = cuTreeFix8Pack;
+
+    p.cu[BLOCK_4x4].ssimDist = ssimDist_c<2>;
+    p.cu[BLOCK_8x8].ssimDist = ssimDist_c<3>;
+    p.cu[BLOCK_16x16].ssimDist = ssimDist_c<4>;
+    p.cu[BLOCK_32x32].ssimDist = ssimDist_c<5>;
+    p.cu[BLOCK_64x64].ssimDist = ssimDist_c<6>;
 }
 }
diff -r cb3e172a5f51 -r d12a4caf7963 source/common/primitives.h
--- a/source/common/primitives.h Tue Feb 19 20:20:35 2019 +0530
+++ b/source/common/primitives.h Wed Feb 27 12:35:02 2019 +0530
@@ -227,6 +227,7 @@
 typedef void(*psyRdoQuant_t)(int16_t *m_resiDctCoeff, int16_t
*m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t
*totalRdCost, int64_t *psyScale, uint32_t blkPos);
 typedef void(*psyRdoQuant_t1)(int16_t *m_resiDctCoeff, int64_t
*costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost,uint32_t
blkPos);
 typedef void(*psyRdoQuant_t2)(int16_t *m_resiDctCoeff, int16_t
*m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t
*totalRdCost, int64_t *psyScale, uint32_t blkPos);
+typedef void(*ssimDistortion_t)(const pixel *fenc, uint32_t fStride, const
pixel *recon,  intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t
*ac_k); // ssBlock and ac_k are outputs: the C reference overwrites both with its block sums
 /* Function pointers to optimized encoder primitives. Each pointer can
reference
  * either an assembly routine, a SIMD intrinsic primitive, or a C function
*/
 struct EncoderPrimitives
@@ -303,6 +304,7 @@
         psyRdoQuant_t    psyRdoQuant;
  psyRdoQuant_t1   psyRdoQuant_1p;
  psyRdoQuant_t2   psyRdoQuant_2p;
+        ssimDistortion_t ssimDist;
     }
     cu[NUM_CU_SIZES];
     /* These remaining primitives work on either fixed block sizes or take
diff -r cb3e172a5f51 -r d12a4caf7963 source/common/quant.cpp
--- a/source/common/quant.cpp Tue Feb 19 20:20:35 2019 +0530
+++ b/source/common/quant.cpp Wed Feb 27 12:35:02 2019 +0530
@@ -501,15 +501,8 @@

     // Calculation of (X(k) - Y(k)) * (X(k) - Y(k)), AC
     ssBlock = 0;
-    for (int y = 0; y < trSize; y++)
-    {
-        for (int x = 0; x < trSize; x++)
-        {
-            int temp = fenc[y * fStride + x] - recon[y * rstride + x]; //
copy of residual coeff
-            ssBlock += temp * temp;
-        }
-    }
-
+    uint64_t ac_k = 0;
+    primitives.cu[log2TrSize - 2].ssimDist(fenc, fStride, recon, rstride,
&ssBlock, shift, &ac_k);
     ssAc = ssBlock - ssDc;

     // 1. Calculation of fdc'
@@ -535,15 +528,6 @@
     uint64_t fAc_num = 0;

     // 2. Calculate ac component
-    uint64_t ac_k = 0;
-    for (int block_yy = 0; block_yy < trSize; block_yy += 1)
-    {
-        for (int block_xx = 0; block_xx < trSize; block_xx += 1)
-        {
-            uint32_t temp = fenc[block_yy * fStride + block_xx] >> shift;
-            ac_k += temp * temp;
-        }
-    }
     ac_k -= dc_k;

     double s = 1 + 0.005 * cu.m_qp[absPartIdx];
diff -r cb3e172a5f51 -r d12a4caf7963 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Feb 19 20:20:35 2019 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Feb 27 12:35:02 2019 +0530
@@ -2319,6 +2319,12 @@
         p.cu[BLOCK_16x16].psyRdoQuant_1p = PFX(psyRdoQuant_1p16_avx2);
         p.cu[BLOCK_32x32].psyRdoQuant_1p = PFX(psyRdoQuant_1p32_avx2);

+        p.cu[BLOCK_4x4].ssimDist = PFX(ssimDist4_avx2);
+        p.cu[BLOCK_8x8].ssimDist = PFX(ssimDist8_avx2);
+        p.cu[BLOCK_16x16].ssimDist = PFX(ssimDist16_avx2);
+        p.cu[BLOCK_32x32].ssimDist = PFX(ssimDist32_avx2);
+        p.cu[BLOCK_64x64].ssimDist = PFX(ssimDist64_avx2);
+
         /* TODO: This kernel needs to be modified to work with
HIGH_BIT_DEPTH only
         p.planeClipAndMax = PFX(planeClipAndMax_avx2); */

@@ -4706,6 +4712,12 @@
         p.cu[BLOCK_16x16].psyRdoQuant_1p = PFX(psyRdoQuant_1p16_avx2);
         p.cu[BLOCK_32x32].psyRdoQuant_1p = PFX(psyRdoQuant_1p32_avx2);

+        p.cu[BLOCK_4x4].ssimDist = PFX(ssimDist4_avx2);
+        p.cu[BLOCK_8x8].ssimDist = PFX(ssimDist8_avx2);
+        p.cu[BLOCK_16x16].ssimDist = PFX(ssimDist16_avx2);
+        p.cu[BLOCK_32x32].ssimDist = PFX(ssimDist32_avx2);
+        p.cu[BLOCK_64x64].ssimDist = PFX(ssimDist64_avx2);
+
     }
     if (cpuMask & X265_CPU_AVX512)
     {
diff -r cb3e172a5f51 -r d12a4caf7963 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Tue Feb 19 20:20:35 2019 +0530
+++ b/source/common/x86/pixel-a.asm Wed Feb 27 12:35:02 2019 +0530
@@ -73,6 +73,16 @@
 cextern pb_movemask_32
 cextern pw_pixel_max

+%if BIT_DEPTH == 12 ; SSIMRD_SHIFT = BIT_DEPTH - 8: right shift applied to fenc samples in the ssimDist kernels
+    %define     SSIMRD_SHIFT          4
+%elif BIT_DEPTH == 10
+    %define     SSIMRD_SHIFT          2
+%elif BIT_DEPTH == 8
+    %define     SSIMRD_SHIFT          0
+%else ; any other bit depth is rejected when the file is assembled
+    %error Unsupported BIT_DEPTH!
+%endif
+
 ;=============================================================================
 ; SATD
 ;=============================================================================
@@ -360,6 +370,24 @@
     RET
 %endmacro

+%macro SSIM_RD_COL 2 ; arg 1 = fenc dwords (clobbered), arg 2 = recon dwords; scratch m2/m6 (args must not be m2/m4/m6/m7)
+    vpsrld         m6,         %1,        SSIMRD_SHIFT         ;m6 = fenc >> SSIMRD_SHIFT
+    vpsubd         %1,         %2                              ;arg 1 = fenc - recon
+; vpmuldq multiplies only the even dwords: square those, shift the odd dwords down, square again
+    vpmuldq        m2,         %1,        %1
+    vpsrldq        %1,         %1,        4
+    vpmuldq        %1,         %1,        %1
+    vpaddq         %1,         m2
+; same even/odd squaring for the shifted fenc values
+    vpmuldq        m2,         m6,        m6
+    vpsrldq        m6,         m6,        4
+    vpmuldq        m6,         m6,        m6
+    vpaddq         m6,         m2
+; fold this group of pixels into the running qword sums
+    vpaddq         m4,         %1                              ;ssBlock accumulator
+    vpaddq         m7,         m6                              ;ac_k accumulator
+%endmacro
+
 ; FIXME avoid the spilling of regs to hold 3*stride.
 ; for small blocks on x86_32, modify pixel pointer instead.

@@ -15883,3 +15911,395 @@
     RET
 %endif
 %endif ; HIGH_BIT_DEPTH == 1 && BIT_DEPTH == 10
+
+;template<int log2TrSize>
+;static void ssimDist_c(const pixel* fenc, uint32_t fStride, const pixel*
recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k)
+;{
+;    *ssBlock = 0;
+;    const uint32_t trSize = 1 << log2TrSize;
+;    for (int y = 0; y < trSize; y++)
+;    {
+;        for (int x = 0; x < trSize; x++)
+;        {
+;            int temp = fenc[y * fStride + x] - recon[y * rstride + x]; //
copy of residual coeff
+;            *ssBlock += temp * temp;
+;        }
+;    }
+;
+;    *ac_k = 0;
+;    for (int block_yy = 0; block_yy < trSize; block_yy += 1)
+;    {
+;        for (int block_xx = 0; block_xx < trSize; block_xx += 1)
+;        {
+;            uint32_t temp = fenc[block_yy * fStride + block_xx] >> shift;
+;            *ac_k += temp * temp;
+;        }
+;    }
+;}
+;-----------------------------------------------------------------------------------------------------------------
+; void ssimDist_c(const pixel* fenc, uint32_t fStride, const pixel* recon,
intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k)
+;-----------------------------------------------------------------------------------------------------------------
+
+INIT_YMM avx2
+cglobal ssimDist4, 7, 8, 8                                     ;4x4 block: r0=fenc r1=fStride r2=recon r3=rstride r4=ssBlock r5=shift r6=ac_k
+    mov            r7d,        4                               ;rows left
+    vpxor          m4,         m4                              ;ssBlock
+    vpxor          m3,         m3                              ;zero, used by punpckhqdq in the final reduction
+    vpxor          m7,         m7                              ;ac_k
+.row:
+%if HIGH_BIT_DEPTH
+    vpmovzxwq      m0,        [r0]                             ;fenc
+    vpmovzxwq      m1,        [r2]                             ;recon
+%elif BIT_DEPTH == 8
+    vpmovzxbq      m0,        [r0]                             ;fenc, 4 bytes widened to qwords
+    vpmovzxbq      m1,        [r2]                             ;recon
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+    vpsrlq         m6,        m0,        SSIMRD_SHIFT          ;fenc >> SSIMRD_SHIFT; NOTE(review): the shift argument (r5) is never read
+    vpsubq         m0,        m1                               ;fenc - recon
+    vpmuldq        m0,        m0,        m0                    ;(fenc - recon)^2, signed multiply of the low dwords
+    vpmuldq        m6,        m6,        m6                    ;(fenc >> SSIMRD_SHIFT)^2
+    vpaddq         m4,        m0
+    vpaddq         m7,        m6
+
+%if HIGH_BIT_DEPTH
+    lea            r0,        [r0 + 2 * r1]                    ;next row, strides scaled by 2 for word pixels
+    lea            r2,        [r2 + 2 * r3]
+%else
+    lea            r0,        [r0 + r1]                        ;NOTE(review): r1 is used at full width but fStride is uint32_t - confirm the upper half is zero
+    lea            r2,        [r2 + r3]
+%endif
+    dec            r7d
+    jnz           .row
+    vextracti128   xm5,       m4,        1                     ;fold the four qword lanes of m4 into the low qword
+    vpaddq         xm4,       xm5
+    punpckhqdq     xm2,       xm4,       xm3
+    paddq          xm4,       xm2
+
+    vextracti128   xm5,       m7,        1                     ;same reduction for m7
+    vpaddq         xm7,       xm5
+    punpckhqdq     xm2,       xm7,       xm3
+    paddq          xm7,       xm2
+
+    movq          [r4],       xm4                              ;*ssBlock
+    movq          [r6],       xm7                              ;*ac_k
+    RET
+
+
+INIT_YMM avx2
+cglobal ssimDist8, 7, 8, 8                                     ;8x8 block: r0=fenc r1=fStride r2=recon r3=rstride r4=ssBlock r5=shift r6=ac_k
+    mov            r7d,        8                               ;rows left
+    vpxor          m4,         m4                              ;ssBlock
+    vpxor          m3,         m3                              ;zero, used by punpckhqdq in the final reduction
+    vpxor          m7,         m7                              ;ac_k
+.row:
+%if HIGH_BIT_DEPTH
+    vpmovzxwd      m0,        [r0]                             ;fenc
+    vpmovzxwd      m1,        [r2]                             ;recon
+%elif BIT_DEPTH == 8
+    vpmovzxbd      m0,        [r0]                             ;fenc, 8 bytes widened to dwords
+    vpmovzxbd      m1,        [r2]                             ;recon
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+    SSIM_RD_COL    m0,          m1                             ;m4 += (fenc - recon)^2, m7 += (fenc >> SSIMRD_SHIFT)^2; the shift argument (r5) is never read
+
+%if HIGH_BIT_DEPTH
+    lea            r0,         [r0 + 2 * r1]                   ;next row, strides scaled by 2 for word pixels
+    lea            r2,         [r2 + 2 * r3]
+%else
+    lea            r0,         [r0 + r1]                       ;NOTE(review): r1 is used at full width but fStride is uint32_t - confirm the upper half is zero
+    lea            r2,         [r2 + r3]
+%endif
+    dec            r7d
+    jnz            .row
+    vextracti128   xm5,        m4,        1                    ;fold the four qword lanes of m4 into the low qword
+    vpaddq         xm4,        xm5
+    punpckhqdq     xm2,        xm4,       xm3
+    paddq          xm4,        xm2
+
+    vextracti128   xm5,        m7,       1                     ;same reduction for m7
+    vpaddq         xm7,        xm5
+    punpckhqdq     xm2,        xm7,      xm3
+    paddq          xm7,        xm2
+
+    movq           [r4],       xm4                             ;*ssBlock
+    movq           [r6],       xm7                             ;*ac_k
+    RET
+
+
+INIT_YMM avx2
+cglobal ssimDist16, 7, 8, 8                                    ;16x16 block: r0=fenc r1=fStride r2=recon r3=rstride r4=ssBlock r5=shift r6=ac_k
+    mov            r7d,         16                             ;rows left
+    vpxor          m4,          m4                                ;ssBlock
+    vpxor          m3,          m3                                ;zero, used by punpckhqdq in the final reduction
+    vpxor          m7,          m7                                ;ac_k
+.row:
+;Col 1-8
+%if HIGH_BIT_DEPTH
+    vpmovzxwd      m0,          [r0]                              ;fenc
+    vpmovzxwd      m1,          [r2]                              ;recon
+%elif BIT_DEPTH == 8
+    vpmovzxbd      m0,          [r0]
+    vpmovzxbd      m1,          [r2]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+    SSIM_RD_COL    m0,          m1                                ;m4 += (fenc - recon)^2, m7 += (fenc >> SSIMRD_SHIFT)^2; the shift argument (r5) is never read
+
+;Col 9-16
+%if HIGH_BIT_DEPTH
+    vpmovzxwd      m0,          [r0 + 16]                         ;fenc
+    vpmovzxwd      m1,          [r2 + 16]                         ;recon
+%elif BIT_DEPTH == 8
+    vpmovzxbd      m0,          [r0 + 8]
+    vpmovzxbd      m1,          [r2 + 8]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+    SSIM_RD_COL    m0,          m1
+
+%if HIGH_BIT_DEPTH
+    lea            r0,         [r0 + 2 * r1]                   ;next row, strides scaled by 2 for word pixels
+    lea            r2,         [r2 + 2 * r3]
+%else
+    lea            r0,         [r0 + r1]                       ;NOTE(review): r1 is used at full width but fStride is uint32_t - confirm the upper half is zero
+    lea            r2,         [r2 + r3]
+%endif
+    dec            r7d
+    jnz           .row
+    vextracti128   xm5,        m4,        1                    ;fold the four qword lanes of m4 into the low qword
+    vpaddq         xm4,        xm5
+    punpckhqdq     xm2,        xm4,       xm3
+    paddq          xm4,        xm2
+
+    vextracti128   xm5,        m7,        1                    ;same reduction for m7
+    vpaddq         xm7,        xm5
+    punpckhqdq     xm2,        xm7,       xm3
+    paddq          xm7,        xm2
+
+    movq           [r4],       xm4                             ;*ssBlock
+    movq           [r6],       xm7                             ;*ac_k
+    RET
+
+
+INIT_YMM avx2
+cglobal ssimDist32, 7, 8, 8                                    ;32x32 block: r0=fenc r1=fStride r2=recon r3=rstride r4=ssBlock r5=shift r6=ac_k
+    mov            r7d,        32                              ;rows left
+    vpxor          m4,         m4                              ;ssBlock
+    vpxor          m3,         m3                              ;zero, used by punpckhqdq in the final reduction
+    vpxor          m7,         m7                              ;ac_k
+.row:
+;Col 1-8
+%if HIGH_BIT_DEPTH
+    vpmovzxwd      m0,         [r0]                            ;fenc
+    vpmovzxwd      m1,         [r2]                            ;recon
+%elif BIT_DEPTH == 8
+    vpmovzxbd      m0,         [r0]
+    vpmovzxbd      m1,         [r2]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+    SSIM_RD_COL    m0,          m1                             ;m4 += (fenc - recon)^2, m7 += (fenc >> SSIMRD_SHIFT)^2; the shift argument (r5) is never read
+
+;Col 9-16
+%if HIGH_BIT_DEPTH
+    vpmovzxwd      m0,          [r0 + 16]                      ;fenc
+    vpmovzxwd      m1,          [r2 + 16]                      ;recon
+%elif BIT_DEPTH == 8
+    vpmovzxbd      m0,          [r0 + 8]
+    vpmovzxbd      m1,          [r2 + 8]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+    SSIM_RD_COL    m0,          m1
+
+;Col 17-24
+%if HIGH_BIT_DEPTH
+    vpmovzxwd      m0,          [r0 + 32]                      ;fenc
+    vpmovzxwd      m1,          [r2 + 32]                      ;recon
+%elif BIT_DEPTH == 8
+    vpmovzxbd      m0,          [r0 + 16]
+    vpmovzxbd      m1,          [r2 + 16]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+    SSIM_RD_COL    m0,          m1
+
+;Col 25-32
+%if HIGH_BIT_DEPTH
+    vpmovzxwd      m0,          [r0 + 48]                      ;fenc
+    vpmovzxwd      m1,          [r2 + 48]                      ;recon
+%elif BIT_DEPTH == 8
+    vpmovzxbd      m0,          [r0 + 24]
+    vpmovzxbd      m1,          [r2 + 24]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+    SSIM_RD_COL    m0,          m1
+
+%if HIGH_BIT_DEPTH
+    lea            r0,          [r0 + 2 * r1]                  ;next row, strides scaled by 2 for word pixels
+    lea            r2,          [r2 + 2 * r3]
+%else
+    lea            r0,          [r0 + r1]                      ;NOTE(review): r1 is used at full width but fStride is uint32_t - confirm the upper half is zero
+    lea            r2,          [r2 + r3]
+%endif
+    dec            r7d
+    jnz           .row
+    vextracti128   xm5,         m4,        1                   ;fold the four qword lanes of m4 into the low qword
+    vpaddq         xm4,         xm5
+    punpckhqdq     xm2,         xm4,       xm3
+    paddq          xm4,         xm2
+
+    vextracti128   xm5,         m7,        1                   ;same reduction for m7
+    vpaddq         xm7,         xm5
+    punpckhqdq     xm2,         xm7,       xm3
+    paddq          xm7,         xm2
+
+    movq           [r4],        xm4                            ;*ssBlock
+    movq           [r6],        xm7                            ;*ac_k
+    RET
+
+
+INIT_YMM avx2
+cglobal ssimDist64, 7, 8, 8                                    ;64x64 block: r0=fenc r1=fStride r2=recon r3=rstride r4=ssBlock r5=shift r6=ac_k
+    mov            r7d,         64                             ;rows left
+    vpxor          m4,          m4                             ;ssBlock
+    vpxor          m3,          m3                             ;zero, used by punpckhqdq in the final reduction
+    vpxor          m7,          m7                             ;ac_k
+.row:
+;Col 1-8
+%if HIGH_BIT_DEPTH
+    vpmovzxwd      m0,          [r0]                           ;fenc
+    vpmovzxwd      m1,          [r2]                           ;recon
+%elif BIT_DEPTH == 8
+    vpmovzxbd      m0,          [r0]
+    vpmovzxbd      m1,          [r2]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+    SSIM_RD_COL    m0,          m1                             ;m4 += (fenc - recon)^2, m7 += (fenc >> SSIMRD_SHIFT)^2; the shift argument (r5) is never read
+
+;Col 9-16
+%if HIGH_BIT_DEPTH
+    vpmovzxwd      m0,          [r0 + 16]                      ;fenc
+    vpmovzxwd      m1,          [r2 + 16]                      ;recon
+%elif BIT_DEPTH == 8
+    vpmovzxbd      m0,          [r0 + 8]
+    vpmovzxbd      m1,          [r2 + 8]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+    SSIM_RD_COL    m0,          m1
+
+;Col 17-24
+%if HIGH_BIT_DEPTH
+    vpmovzxwd      m0,          [r0 + 32]                      ;fenc
+    vpmovzxwd      m1,          [r2 + 32]                      ;recon
+%elif BIT_DEPTH == 8
+    vpmovzxbd      m0,          [r0 + 16]
+    vpmovzxbd      m1,          [r2 + 16]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+    SSIM_RD_COL    m0,          m1
+
+;Col 25-32
+%if HIGH_BIT_DEPTH
+    vpmovzxwd      m0,          [r0 + 48]                      ;fenc
+    vpmovzxwd      m1,          [r2 + 48]                      ;recon
+%elif BIT_DEPTH == 8
+    vpmovzxbd      m0,          [r0 + 24]
+    vpmovzxbd      m1,          [r2 + 24]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+    SSIM_RD_COL    m0,          m1
+
+;Col 33-40
+%if HIGH_BIT_DEPTH
+    vpmovzxwd      m0,          [r0 + 64]                      ;fenc
+    vpmovzxwd      m1,          [r2 + 64]                      ;recon
+%elif BIT_DEPTH == 8
+    vpmovzxbd      m0,          [r0 + 32]
+    vpmovzxbd      m1,          [r2 + 32]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+    SSIM_RD_COL    m0,          m1
+
+;Col 41-48
+%if HIGH_BIT_DEPTH
+    vpmovzxwd      m0,          [r0 + 80]                      ;fenc
+    vpmovzxwd      m1,          [r2 + 80]                      ;recon
+%elif BIT_DEPTH == 8
+    vpmovzxbd      m0,          [r0 + 40]
+    vpmovzxbd      m1,          [r2 + 40]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+    SSIM_RD_COL    m0,          m1
+
+;Col 49-56
+%if HIGH_BIT_DEPTH
+    vpmovzxwd      m0,          [r0 + 96]                      ;fenc
+    vpmovzxwd      m1,          [r2 + 96]                      ;recon
+%elif BIT_DEPTH == 8
+    vpmovzxbd      m0,          [r0 + 48]
+    vpmovzxbd      m1,          [r2 + 48]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+    SSIM_RD_COL    m0,          m1
+
+;Col 57-64
+%if HIGH_BIT_DEPTH
+    vpmovzxwd      m0,          [r0 + 112]                     ;fenc
+    vpmovzxwd      m1,          [r2 + 112]                     ;recon
+%elif BIT_DEPTH == 8
+    vpmovzxbd      m0,          [r0 + 56]
+    vpmovzxbd      m1,          [r2 + 56]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+    SSIM_RD_COL    m0,          m1
+
+%if HIGH_BIT_DEPTH
+    lea            r0,          [r0 + 2 * r1]                  ;next row, strides scaled by 2 for word pixels
+    lea            r2,          [r2 + 2 * r3]
+%else
+    lea            r0,          [r0 + r1]                      ;NOTE(review): r1 is used at full width but fStride is uint32_t - confirm the upper half is zero
+    lea            r2,          [r2 + r3]
+%endif
+    dec            r7d
+    jnz            .row
+    vextracti128   xm5,          m4,        1                  ;fold the four qword lanes of m4 into the low qword
+    vpaddq         xm4,          xm5
+    punpckhqdq     xm2,          xm4,       xm3
+    paddq          xm4,          xm2
+
+    vextracti128   xm5,          m7,        1                  ;same reduction for m7
+    vpaddq         xm7,          xm5
+    punpckhqdq     xm2,          xm7,       xm3
+    paddq          xm7,          xm2
+
+    movq           [r4],         xm4                           ;*ssBlock
+    movq           [r6],         xm7                           ;*ac_k
+    RET
diff -r cb3e172a5f51 -r d12a4caf7963 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Tue Feb 19 20:20:35 2019 +0530
+++ b/source/common/x86/pixel.h Wed Feb 27 12:35:02 2019 +0530
@@ -60,7 +60,8 @@
     FUNCDEF_TU_S(sse_t, pixel_ssd_s_aligned, cpu, const int16_t*,
intptr_t); \
     FUNCDEF_TU(uint64_t, pixel_var, cpu, const pixel*, intptr_t); \
     FUNCDEF_TU(int, psyCost_pp, cpu, const pixel* source, intptr_t
sstride, const pixel* recon, intptr_t rstride); \
-    FUNCDEF_TU(int, psyCost_ss, cpu, const int16_t* source, intptr_t
sstride, const int16_t* recon, intptr_t rstride)
+    FUNCDEF_TU(int, psyCost_ss, cpu, const int16_t* source, intptr_t
sstride, const int16_t* recon, intptr_t rstride); \
+    FUNCDEF_TU_S2(void, ssimDist, cpu, const pixel *fenc, uint32_t
fStride, const pixel *recon, intptr_t rstride, uint64_t *ssBlock, int
shift, uint64_t *ac_k)

 DECL_PIXELS(mmx);
 DECL_PIXELS(mmx2);
diff -r cb3e172a5f51 -r d12a4caf7963 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Tue Feb 19 20:20:35 2019 +0530
+++ b/source/test/pixelharness.cpp Wed Feb 27 12:35:02 2019 +0530
@@ -2270,6 +2270,32 @@
     return true;
 }

+bool PixelHarness::check_ssimDist(ssimDistortion_t ref, ssimDistortion_t opt)
+{
+    uint32_t srcStride[5] = { 4, 8, 16, 32, 64 };
+    intptr_t dstStride[5] = { 4, 8, 16, 32, 64 };
+    int shift = X265_DEPTH - 8;
+    uint64_t opt_dest1 = 0, ref_dest1 = 0, opt_dest2 = 0, ref_dest2 = 0;
+    int j = 0;
+    // Run ref and opt on identical inputs with randomly picked strides; both outputs must agree.
+    for (int i = 0; i < ITERS; i++)
+    {
+        int index = i % TEST_CASES; // NOTE(review): index + 10 is used below - confirm pixel_test_buff has that many entries
+        int k1 = rand() % 5, k2 = rand() % 5;
+        ref(pixel_test_buff[index] + j, srcStride[k1], pixel_test_buff[index + 10] + j, dstStride[k2], &ref_dest1, shift, &ref_dest2);
+        opt(pixel_test_buff[index] + j, srcStride[k1], pixel_test_buff[index + 10] + j, dstStride[k2], &opt_dest1, shift, &opt_dest2);
+        // dest1 receives ssBlock and dest2 receives ac_k: a mismatch in either one alone is a failure.
+        if (opt_dest1 != ref_dest1 || opt_dest2 != ref_dest2)
+        {
+            return false;
+        }
+
+        reportfail()
+        j += INCR;
+    }
+    return true;
+}
+
 bool PixelHarness::testPU(int part, const EncoderPrimitives& ref, const
EncoderPrimitives& opt)
 {
     if (opt.pu[part].satd)
@@ -2607,6 +2633,15 @@
             }
         }

+        if (opt.cu[i].ssimDist)
+        {
+            if (!check_ssimDist(ref.cu[i].ssimDist, opt.cu[i].ssimDist))
+            {
+                printf("\nssimDist[%dx%d] failed!\n", 4 << i, 4 << i);
+                return false;
+            }
+        }
+
         if (i < BLOCK_64x64)
         {
             /* TU only primitives */
@@ -3093,6 +3128,7 @@
             return false;
         }
     }
+
     return true;
 }

@@ -3392,6 +3428,14 @@
             HEADER("psy_cost_pp[%dx%d]", 4 << i, 4 << i);
             REPORT_SPEEDUP(opt.cu[i].psy_cost_pp, ref.cu[i].psy_cost_pp,
pbuf1, STRIDE, pbuf2, STRIDE);
         }
+
+        if (opt.cu[i].ssimDist)
+        {
+            uint64_t dst1 = 0, dst2 = 0;
+            int shift = X265_DEPTH - 8;
+            HEADER("ssimDist[%dx%d]", 4 << i, 4 << i); // same label helper as the psy_cost_pp benchmark just above
+            REPORT_SPEEDUP(opt.cu[i].ssimDist, ref.cu[i].ssimDist, pixel_test_buff[0], 32, pixel_test_buff[5], 64, &dst1, shift, &dst2);
+        }
     }

     if (opt.weight_pp)
diff -r cb3e172a5f51 -r d12a4caf7963 source/test/pixelharness.h
--- a/source/test/pixelharness.h Tue Feb 19 20:20:35 2019 +0530
+++ b/source/test/pixelharness.h Wed Feb 27 12:35:02 2019 +0530
@@ -136,6 +136,7 @@
     bool check_pelFilterChroma_H(pelFilterChroma_t ref, pelFilterChroma_t
opt);
     bool check_integral_initv(integralv_t ref, integralv_t opt);
     bool check_integral_inith(integralh_t ref, integralh_t opt);
+    bool check_ssimDist(ssimDistortion_t ref, ssimDistortion_t opt);

 public:


-- 
*Regards,*
*Akil R*
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20190305/dcf33b90/attachment-0001.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: ssimDistortion_avx2.patch
Type: application/octet-stream
Size: 23892 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20190305/dcf33b90/attachment-0001.obj>


More information about the x265-devel mailing list