[x265] [PATCH x265] Add AVX2 assembly code for normFactor primitive.

Tue Mar 5 05:34:22 CET 2019

# HG changeset patch
# User Akil Ayyappan<akil at multicorewareinc.com>
# Date 1551693998 -19800
#      Mon Mar 04 15:36:38 2019 +0530
# Node ID 19f27e0c8a6f8250af5e2f7c7984dc3b57bea7e4
# Parent  d12a4caf7963fd47d646040689ad5f02754ad879
x86: normFactor primitive

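This patch adds the normFact primitive: a C reference (normFact_c), AVX2 assembly for the 8x8, 16x16, 32x32 and 64x64 block sizes, and a pixel-harness test. analysis.cpp now calls the primitive instead of its inline loop.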

|---------|-----------|-----------------|-----------------|
|  Size   |Performance|AVX2 clock cycles|CPP clock cycles |
|---------|-----------|-----------------|-----------------|
| [8x8]   |   7.65x   |   312.90        |   2394.83       |
| [16x16] |   8.42x   |   1157.14       |   9741.56       |
| [32x32] |   9.56x   |   3942.18       |   37692.20      |
| [64x64] |   8.96x   |   15388.24      |   137889.28     |
|---------|-----------|-----------------|-----------------|

diff -r d12a4caf7963 -r 19f27e0c8a6f source/common/pixel.cpp

--- a/source/common/pixel.cpp Wed Feb 27 12:35:02 2019 +0530
+++ b/source/common/pixel.cpp Mon Mar 04 15:36:38 2019 +0530
@@ -959,6 +959,19 @@
     }
 }

+/* Reference normFact: stores in *z_k the sum of (src[i] >> shift)^2 over a
+ * contiguous blockSize x blockSize block (row stride == blockSize). */
+static void normFact_c(const pixel* src, uint32_t blockSize, int shift,
uint64_t *z_k)
+{
+    uint64_t sumSq = 0;
+    for (uint32_t row = 0; row < blockSize; row++)
+    {
+        for (uint32_t col = 0; col < blockSize; col++)
+        {
+            // the square is formed in 32 bits and widened only when accumulated
+            const uint32_t scaled = src[row * blockSize + col] >> shift;
+            sumSq += scaled * scaled;
+        }
+    }
+    *z_k = sumSq;
+}
+
 #if HIGH_BIT_DEPTH
 static pixel planeClipAndMax_c(pixel *src, intptr_t stride, int width, int
height, uint64_t *outsum,
                                const pixel minPix, const pixel maxPix)
@@ -1314,5 +1327,10 @@
     p.cu[BLOCK_16x16].ssimDist = ssimDist_c<4>;
     p.cu[BLOCK_32x32].ssimDist = ssimDist_c<5>;
     p.cu[BLOCK_64x64].ssimDist = ssimDist_c<6>;
+
+    p.cu[BLOCK_8x8].normFact = normFact_c;
+    p.cu[BLOCK_16x16].normFact = normFact_c;
+    p.cu[BLOCK_32x32].normFact = normFact_c;
+    p.cu[BLOCK_64x64].normFact = normFact_c;
 }
 }
diff -r d12a4caf7963 -r 19f27e0c8a6f source/common/primitives.h
--- a/source/common/primitives.h Wed Feb 27 12:35:02 2019 +0530
+++ b/source/common/primitives.h Mon Mar 04 15:36:38 2019 +0530
@@ -228,6 +228,7 @@
 typedef void(*psyRdoQuant_t1)(int16_t *m_resiDctCoeff, int64_t
*costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost,uint32_t
blkPos);
 typedef void(*psyRdoQuant_t2)(int16_t *m_resiDctCoeff, int16_t
*m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t
*totalRdCost, int64_t *psyScale, uint32_t blkPos);
 typedef void(*ssimDistortion_t)(const pixel *fenc, uint32_t fStride, const
pixel *recon,  intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t
*ac_k);
+typedef void(*normFactor_t)(const pixel *src, uint32_t blockSize, int
shift, uint64_t *z_k); // reference behaviour (normFact_c): *z_k = sum of (src[i] >> shift)^2 over a contiguous blockSize x blockSize block
 /* Function pointers to optimized encoder primitives. Each pointer can
reference
  * either an assembly routine, a SIMD intrinsic primitive, or a C function
*/
 struct EncoderPrimitives
@@ -305,6 +306,7 @@
  psyRdoQuant_t1   psyRdoQuant_1p;
  psyRdoQuant_t2   psyRdoQuant_2p;
         ssimDistortion_t ssimDist;
+        normFactor_t     normFact;
     }
     cu[NUM_CU_SIZES];
     /* These remaining primitives work on either fixed block sizes or take
diff -r d12a4caf7963 -r 19f27e0c8a6f source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Feb 27 12:35:02 2019 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Mar 04 15:36:38 2019 +0530
@@ -2325,6 +2325,11 @@
         p.cu[BLOCK_32x32].ssimDist = PFX(ssimDist32_avx2);
         p.cu[BLOCK_64x64].ssimDist = PFX(ssimDist64_avx2);

+        p.cu[BLOCK_8x8].normFact = PFX(normFact8_avx2);
+        p.cu[BLOCK_16x16].normFact = PFX(normFact16_avx2);
+        p.cu[BLOCK_32x32].normFact = PFX(normFact32_avx2);
+        p.cu[BLOCK_64x64].normFact = PFX(normFact64_avx2);
+
         /* TODO: This kernel needs to be modified to work with
HIGH_BIT_DEPTH only
         p.planeClipAndMax = PFX(planeClipAndMax_avx2); */

@@ -4718,6 +4723,11 @@
         p.cu[BLOCK_32x32].ssimDist = PFX(ssimDist32_avx2);
         p.cu[BLOCK_64x64].ssimDist = PFX(ssimDist64_avx2);

+        p.cu[BLOCK_8x8].normFact = PFX(normFact8_avx2);
+        p.cu[BLOCK_16x16].normFact = PFX(normFact16_avx2);
+        p.cu[BLOCK_32x32].normFact = PFX(normFact32_avx2);
+        p.cu[BLOCK_64x64].normFact = PFX(normFact64_avx2);
+
     }
     if (cpuMask & X265_CPU_AVX512)
     {
diff -r d12a4caf7963 -r 19f27e0c8a6f source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Wed Feb 27 12:35:02 2019 +0530
+++ b/source/common/x86/pixel-a.asm Mon Mar 04 15:36:38 2019 +0530
@@ -388,6 +388,16 @@
     vpaddq         m7,         m6
 %endmacro

+%macro NORM_FACT_COL 1 ; %1 = register of packed dword samples; adds their shifted squares to m3, clobbers m1 and m2
+    vpsrld         m1,          %1,        SSIMRD_SHIFT  ; each dword sample >> SSIMRD_SHIFT
+    vpmuldq        m2,          m1,        m1            ; squares of the even dwords (low dword of every qword), as qwords
+    vpsrldq        m1,          m1,        4             ; byte-shift each 128-bit lane right by 4: odd dwords move into the even slots
+    vpmuldq        m1,          m1,        m1            ; squares of the original odd dwords, as qwords
+
+    vpaddq         m1,          m2                       ; even + odd squares per qword lane
+    vpaddq         m3,          m1                       ; accumulate into the running qword sums
+%endmacro
+
 ; FIXME avoid the spilling of regs to hold 3*stride.
 ; for small blocks on x86_32, modify pixel pointer instead.

@@ -16303,3 +16313,266 @@
     movq           [r4],         xm4
     movq           [r6],         xm7
     RET
+
+
+;static void normFact_c(const pixel* src, uint32_t blockSize, int shift,
uint64_t *z_k)
+;{
+;    *z_k = 0;
+;    for (uint32_t block_yy = 0; block_yy < blockSize; block_yy += 1)
+;    {
+;        for (uint32_t block_xx = 0; block_xx < blockSize; block_xx += 1)
+;        {
+;            uint32_t temp = src[block_yy * blockSize + block_xx] >> shift;
+;            *z_k += temp * temp;
+;        }
+;    }
+;}
+;--------------------------------------------------------------------------------------
+; void normFactN_avx2(const pixel* src, uint32_t blockSize, int shift,
uint64_t *z_k)   with N = 8, 16, 32, 64 (AVX2 versions of normFact_c above)
+;--------------------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal normFact8, 4, 5, 6  ; args in prototype order: r0 = src, r1 = blockSize, r2 = shift, r3 = z_k; r2 is never read below
+    mov            r4d,       8  ; r4d = rows left to process
+    vpxor          m3,        m3                               ;z_k accumulator (64-bit lanes), cleared
+    vpxor          m5,        m5  ; m5 = 0, used only as the second source of the final unpack
+.row:
+%if HIGH_BIT_DEPTH
+    vpmovzxwd      m0,        [r0]                             ;src: 8 word samples zero-extended to dwords
+%elif BIT_DEPTH == 8
+    vpmovzxbd      m0,        [r0]  ; src: 8 byte samples zero-extended to dwords
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+    NORM_FACT_COL  m0  ; m3 += (sample >> SSIMRD_SHIFT)^2 for the 8 samples in m0; the shift is the assemble-time constant, not r2
+
+%if HIGH_BIT_DEPTH
+    lea            r0,         [r0 + 2 * r1]  ; next row: stride is blockSize samples of 2 bytes. NOTE(review): r1 is a 32-bit argument used as a 64-bit offset - confirm its upper half is zero
+%else
+    lea            r0,         [r0 + r1]  ; next row: stride is blockSize bytes. NOTE(review): r1 is a 32-bit argument used as a 64-bit offset - confirm its upper half is zero
+%endif
+    dec            r4d
+    jnz           .row
+    vextracti128   xm4,         m3,        1  ; xm4 = upper 128 bits of the accumulator
+    vpaddq         xm3,         xm4  ; fold upper half into lower half: two qword sums remain
+    punpckhqdq     xm2,         xm3,       xm5  ; low qword of xm2 = high qword of xm3
+    paddq          xm3,         xm2  ; low qword of xm3 = total
+    movq           [r3],        xm3  ; *z_k = total
+    RET
+
+
+INIT_YMM avx2
+cglobal normFact16, 4, 5, 6  ; args in prototype order: r0 = src, r1 = blockSize, r2 = shift, r3 = z_k; r2 is never read below
+    mov            r4d,         16  ; r4d = rows left to process
+    vpxor          m3,          m3                                ;z_k accumulator (64-bit lanes), cleared
+    vpxor          m5,          m5  ; m5 = 0, used only as the second source of the final unpack
+.row:
+;Col 1-8
+%if HIGH_BIT_DEPTH
+    vpmovzxwd      m0,          [r0]                              ;src: 8 word samples zero-extended to dwords
+%elif BIT_DEPTH == 8
+    vpmovzxbd      m0,          [r0]  ; src: 8 byte samples zero-extended to dwords
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+    NORM_FACT_COL  m0  ; m3 += (sample >> SSIMRD_SHIFT)^2 for the 8 samples in m0; the shift is the assemble-time constant, not r2
+
+;Col 9-16
+%if HIGH_BIT_DEPTH
+    vpmovzxwd      m0,          [r0 + 16]                         ;src: next 8 samples (byte offset 16 = 8 words)
+%elif BIT_DEPTH == 8
+    vpmovzxbd      m0,          [r0 + 8]  ; src: next 8 samples
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+    NORM_FACT_COL  m0
+
+%if HIGH_BIT_DEPTH
+    lea            r0,         [r0 + 2 * r1]  ; next row: stride is blockSize samples of 2 bytes. NOTE(review): r1 is a 32-bit argument used as a 64-bit offset - confirm its upper half is zero
+%else
+    lea            r0,         [r0 + r1]  ; next row: stride is blockSize bytes. NOTE(review): r1 is a 32-bit argument used as a 64-bit offset - confirm its upper half is zero
+%endif
+    dec            r4d
+    jnz           .row
+    vextracti128   xm4,         m3,        1  ; xm4 = upper 128 bits of the accumulator
+    vpaddq         xm3,         xm4  ; fold upper half into lower half: two qword sums remain
+    punpckhqdq     xm2,         xm3,       xm5  ; low qword of xm2 = high qword of xm3
+    paddq          xm3,         xm2  ; low qword of xm3 = total
+    movq           [r3],        xm3  ; *z_k = total
+    RET
+
+
+INIT_YMM avx2
+cglobal normFact32, 4, 5, 6  ; args in prototype order: r0 = src, r1 = blockSize, r2 = shift, r3 = z_k; r2 is never read below
+    mov            r4d,         32  ; r4d = rows left to process
+    vpxor          m3,          m3                              ;z_k accumulator (64-bit lanes), cleared
+    vpxor          m5,          m5  ; m5 = 0, used only as the second source of the final unpack
+.row:
+;Col 1-8
+%if HIGH_BIT_DEPTH
+    vpmovzxwd      m0,         [r0]                             ;src: 8 word samples zero-extended to dwords
+%elif BIT_DEPTH == 8
+    vpmovzxbd      m0,         [r0]  ; src: 8 byte samples zero-extended to dwords
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+    NORM_FACT_COL  m0  ; m3 += (sample >> SSIMRD_SHIFT)^2 for the 8 samples in m0; the shift is the assemble-time constant, not r2
+
+;Col 9-16
+%if HIGH_BIT_DEPTH
+    vpmovzxwd      m0,          [r0 + 16]                      ;src
+%elif BIT_DEPTH == 8
+    vpmovzxbd      m0,          [r0 + 8]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+    NORM_FACT_COL  m0
+
+;Col 17-24
+%if HIGH_BIT_DEPTH
+    vpmovzxwd      m0,          [r0 + 32]                      ;src
+%elif BIT_DEPTH == 8
+    vpmovzxbd      m0,          [r0 + 16]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+    NORM_FACT_COL  m0
+
+;Col 25-32
+%if HIGH_BIT_DEPTH
+    vpmovzxwd      m0,          [r0 + 48]                      ;src
+%elif BIT_DEPTH == 8
+    vpmovzxbd      m0,          [r0 + 24]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+    NORM_FACT_COL  m0
+
+%if HIGH_BIT_DEPTH
+    lea            r0,          [r0 + 2 * r1]  ; next row: stride is blockSize samples of 2 bytes. NOTE(review): r1 is a 32-bit argument used as a 64-bit offset - confirm its upper half is zero
+%else
+    lea            r0,          [r0 + r1]  ; next row: stride is blockSize bytes. NOTE(review): r1 is a 32-bit argument used as a 64-bit offset - confirm its upper half is zero
+%endif
+    dec            r4d
+    jnz           .row
+    vextracti128   xm4,         m3,        1  ; xm4 = upper 128 bits of the accumulator
+    vpaddq         xm3,         xm4  ; fold upper half into lower half: two qword sums remain
+    punpckhqdq     xm2,         xm3,       xm5  ; low qword of xm2 = high qword of xm3
+    paddq          xm3,         xm2  ; low qword of xm3 = total
+    movq           [r3],        xm3  ; *z_k = total
+    RET
+
+
+INIT_YMM avx2
+cglobal normFact64, 4, 5, 6  ; args in prototype order: r0 = src, r1 = blockSize, r2 = shift, r3 = z_k; r2 is never read below
+    mov            r4d,         64  ; r4d = rows left to process
+    vpxor          m3,          m3                             ;z_k accumulator (64-bit lanes), cleared
+    vpxor          m5,          m5  ; m5 = 0, used only as the second source of the final unpack
+.row:
+;Col 1-8
+%if HIGH_BIT_DEPTH
+    vpmovzxwd      m0,          [r0]                           ;src: 8 word samples zero-extended to dwords
+%elif BIT_DEPTH == 8
+    vpmovzxbd      m0,          [r0]  ; src: 8 byte samples zero-extended to dwords
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+    NORM_FACT_COL  m0  ; m3 += (sample >> SSIMRD_SHIFT)^2 for the 8 samples in m0; the shift is the assemble-time constant, not r2
+
+;Col 9-16
+%if HIGH_BIT_DEPTH
+    vpmovzxwd      m0,          [r0 + 16]                      ;src
+%elif BIT_DEPTH == 8
+    vpmovzxbd      m0,          [r0 + 8]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+    NORM_FACT_COL  m0
+
+;Col 17-24
+%if HIGH_BIT_DEPTH
+    vpmovzxwd      m0,          [r0 + 32]                      ;src
+%elif BIT_DEPTH == 8
+    vpmovzxbd      m0,          [r0 + 16]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+    NORM_FACT_COL  m0
+
+;Col 25-32
+%if HIGH_BIT_DEPTH
+    vpmovzxwd      m0,          [r0 + 48]                      ;src
+%elif BIT_DEPTH == 8
+    vpmovzxbd      m0,          [r0 + 24]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+    NORM_FACT_COL  m0
+
+;Col 33-40
+%if HIGH_BIT_DEPTH
+    vpmovzxwd      m0,          [r0 + 64]                      ;src
+%elif BIT_DEPTH == 8
+    vpmovzxbd      m0,          [r0 + 32]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+    NORM_FACT_COL  m0
+
+;Col 41-48
+%if HIGH_BIT_DEPTH
+    vpmovzxwd      m0,          [r0 + 80]                      ;src
+%elif BIT_DEPTH == 8
+    vpmovzxbd      m0,          [r0 + 40]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+    NORM_FACT_COL  m0
+
+;Col 49-56
+%if HIGH_BIT_DEPTH
+    vpmovzxwd      m0,          [r0 + 96]                      ;src
+%elif BIT_DEPTH == 8
+    vpmovzxbd      m0,          [r0 + 48]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+    NORM_FACT_COL  m0
+
+;Col 57-64
+%if HIGH_BIT_DEPTH
+    vpmovzxwd      m0,          [r0 + 112]                     ;src
+%elif BIT_DEPTH == 8
+    vpmovzxbd      m0,          [r0 + 56]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+
+    NORM_FACT_COL  m0
+
+%if HIGH_BIT_DEPTH
+    lea            r0,          [r0 + 2 * r1]  ; next row: stride is blockSize samples of 2 bytes. NOTE(review): r1 is a 32-bit argument used as a 64-bit offset - confirm its upper half is zero
+%else
+    lea            r0,          [r0 + r1]  ; next row: stride is blockSize bytes. NOTE(review): r1 is a 32-bit argument used as a 64-bit offset - confirm its upper half is zero
+%endif
+    dec            r4d
+    jnz           .row
+    vextracti128   xm4,         m3,        1  ; xm4 = upper 128 bits of the accumulator
+    vpaddq         xm3,         xm4  ; fold upper half into lower half: two qword sums remain
+    punpckhqdq     xm2,         xm3,       xm5  ; low qword of xm2 = high qword of xm3
+    paddq          xm3,         xm2  ; low qword of xm3 = total
+    movq           [r3],        xm3  ; *z_k = total
+    RET
diff -r d12a4caf7963 -r 19f27e0c8a6f source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Wed Feb 27 12:35:02 2019 +0530
+++ b/source/common/x86/pixel.h Mon Mar 04 15:36:38 2019 +0530
@@ -61,7 +61,8 @@
     FUNCDEF_TU(uint64_t, pixel_var, cpu, const pixel*, intptr_t); \
     FUNCDEF_TU(int, psyCost_pp, cpu, const pixel* source, intptr_t
sstride, const pixel* recon, intptr_t rstride); \
     FUNCDEF_TU(int, psyCost_ss, cpu, const int16_t* source, intptr_t
sstride, const int16_t* recon, intptr_t rstride); \
-    FUNCDEF_TU_S2(void, ssimDist, cpu, const pixel *fenc, uint32_t
fStride, const pixel *recon, intptr_t rstride, uint64_t *ssBlock, int
shift, uint64_t *ac_k)
+    FUNCDEF_TU_S2(void, ssimDist, cpu, const pixel *fenc, uint32_t
fStride, const pixel *recon, intptr_t rstride, uint64_t *ssBlock, int
shift, uint64_t *ac_k); \
+    FUNCDEF_TU_S2(void, normFact, cpu, const pixel *src, uint32_t
blockSize, int shift, uint64_t *z_k)

 DECL_PIXELS(mmx);
 DECL_PIXELS(mmx2);
diff -r d12a4caf7963 -r 19f27e0c8a6f source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp Wed Feb 27 12:35:02 2019 +0530
+++ b/source/encoder/analysis.cpp Mon Mar 04 15:36:38 2019 +0530
@@ -3696,14 +3696,8 @@

     // 2. Calculate ac component
     uint64_t z_k = 0;
-    for (uint32_t block_yy = 0; block_yy < blockSize; block_yy += 1)
-    {
-        for (uint32_t block_xx = 0; block_xx < blockSize; block_xx += 1)
-        {
-            uint32_t temp = src[block_yy * blockSize + block_xx] >> shift;
-            z_k += temp * temp;
-        }
-    }
+    int block = (int)(log((double)blockSize) / log(2.0) + 0.5) - 2; // cu[] index = log2(blockSize) - 2; round first, since the quotient may land just below the exact integer and truncation would then pick the next smaller size
+    primitives.cu[block].normFact(src, blockSize, shift, &z_k);

     // Remove the DC part
     z_k -= z_o;
diff -r d12a4caf7963 -r 19f27e0c8a6f source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Wed Feb 27 12:35:02 2019 +0530
+++ b/source/test/pixelharness.cpp Mon Mar 04 15:36:38 2019 +0530
@@ -2296,6 +2296,30 @@
     return true;
 }

+bool PixelHarness::check_normFact(normFactor_t ref, normFactor_t opt, int
block)
+{
+    const int blockSize = 4 << block; // block is the cu[] index; the side length is 4 << index
+    const int shift = X265_DEPTH - 8;
+    uint64_t refSum = 0, optSum = 0;
+    int offset = 0;                   // start offset into the test buffer, advanced after every pass
+
+    for (int iter = 0; iter < ITERS; iter++)
+    {
+        // both implementations read exactly the same input window
+        const pixel* input = pixel_test_buff[iter % TEST_CASES] + offset;
+        ref(input, blockSize, shift, &refSum);
+        opt(input, blockSize, shift, &optSum);
+
+        if (refSum != optSum)
+        {
+            return false;
+        }
+
+        reportfail()
+            offset += INCR;
+    }
+    return true;
+}
+
 bool PixelHarness::testPU(int part, const EncoderPrimitives& ref, const
EncoderPrimitives& opt)
 {
     if (opt.pu[part].satd)
@@ -3129,6 +3153,18 @@
         }
     }

+    for (int i = BLOCK_8x8; i < NUM_CU_SIZES; i++)
+    {
+        if (opt.cu[i].normFact)
+        {
+            if (!check_normFact(ref.cu[i].normFact, opt.cu[i].normFact, i))
+            {
+                printf("\nnormFact[%dx%d] failed!\n", 4 << i, 4 << i);
+                return false;
+            }
+        }
+    }
+
     return true;
 }

@@ -3769,4 +3805,16 @@
             REPORT_SPEEDUP(opt.integral_inith[k], ref.integral_inith[k],
dst_buf, pbuf1, STRIDE);
         }
     }
+
+    for (int i = BLOCK_8x8; i < NUM_CU_SIZES; i++)
+    {
+        if (opt.cu[i].normFact)
+        {
+            uint64_t dst = 0;
+            int blockSize = 4 << i;
+            int shift = X265_DEPTH - 8;
+            printf("normFact[%dx%d]", blockSize, blockSize);
+            REPORT_SPEEDUP(opt.cu[i].normFact, ref.cu[i].normFact,
pixel_test_buff[0], blockSize, shift, &dst);
+        }
+    }
 }
diff -r d12a4caf7963 -r 19f27e0c8a6f source/test/pixelharness.h
--- a/source/test/pixelharness.h Wed Feb 27 12:35:02 2019 +0530
+++ b/source/test/pixelharness.h Mon Mar 04 15:36:38 2019 +0530
@@ -137,6 +137,7 @@
     bool check_integral_initv(integralv_t ref, integralv_t opt);
     bool check_integral_inith(integralh_t ref, integralh_t opt);
     bool check_ssimDist(ssimDistortion_t ref, ssimDistortion_t opt);
+    bool check_normFact(normFactor_t ref, normFactor_t opt, int block);

 public:



-- 
*Regards,*
*Akil R*
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20190305/add805be/attachment-0001.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: normFactor_avx2.patch
Type: application/octet-stream
Size: 16207 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20190305/add805be/attachment-0001.obj>