[x265] [PATCH 114 of 307] x86: Aligned routine implementation for blockfill_s primitive
mythreyi at multicorewareinc.com
Sat Apr 7 04:31:52 CEST 2018
# HG changeset patch
# User Jayashri Murugan <jayashri at multicorewareinc.com>
# Date 1507112703 -19800
# Wed Oct 04 15:55:03 2017 +0530
# Node ID 14c93ddbd598128b43a96ff21221e2dbb189d275
# Parent ddc227597df3335e30cec9a50489f3fd87391274
x86: Aligned routine implementation for blockfill_s primitive
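blockfill_s fills a square block of int16_t coefficients with a single value; it is used when a transform block reduces to a pure DC term. The generic reference in source/common/pixel.cpp is essentially the following (a sketch reconstructed from the signatures in this patch, not a verbatim copy); the new blockfill_s_aligned entry point keeps the same signature but may additionally assume a 64-byte-aligned destination:

    template<int size>
    static void blockfill_s_c(int16_t* dst, intptr_t dstride, int16_t val)
    {
        // Write 'val' into every coefficient of a size x size block.
        for (int y = 0; y < size; y++)
            for (int x = 0; x < size; x++)
                dst[y * dstride + x] = val;
    }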
diff -r ddc227597df3 -r 14c93ddbd598 source/common/pixel.cpp
--- a/source/common/pixel.cpp Wed Oct 04 14:03:32 2017 +0530
+++ b/source/common/pixel.cpp Wed Oct 04 15:55:03 2017 +0530
@@ -1000,6 +1000,7 @@
p.cu[BLOCK_ ## W ## x ## H].copy_ps = blockcopy_ps_c<W, H>; \
p.cu[BLOCK_ ## W ## x ## H].copy_ss = blockcopy_ss_c<W, H>; \
p.cu[BLOCK_ ## W ## x ## H].blockfill_s = blockfill_s_c<W>; \
+ p.cu[BLOCK_ ## W ## x ## H].blockfill_s_aligned = blockfill_s_c<W>; \
p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shl = cpy2Dto1D_shl<W>; \
p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shr = cpy2Dto1D_shr<W>; \
p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl = cpy1Dto2D_shl<W>; \
diff -r ddc227597df3 -r 14c93ddbd598 source/common/primitives.h
--- a/source/common/primitives.h Wed Oct 04 14:03:32 2017 +0530
+++ b/source/common/primitives.h Wed Oct 04 15:55:03 2017 +0530
@@ -271,6 +271,7 @@
pixel_sub_ps_t sub_ps;
pixel_add_ps_t add_ps;
blockfill_s_t blockfill_s; // block fill, for DC transforms
+ blockfill_s_t blockfill_s_aligned; // block fill, for DC transforms
copy_cnt_t copy_cnt; // copy coeff while counting non-zero
count_nonzero_t count_nonzero;
cpy2Dto1D_shl_t cpy2Dto1D_shl;
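Both members share the existing blockfill_s_t function-pointer type which, judging from the declarations later in this patch, has the form shown below; the aligned variant simply carries the caller's promise of a 64-byte-aligned destination:

    // Shared function-pointer type for blockfill_s and blockfill_s_aligned
    // (the stride is in int16_t elements, not bytes).
    typedef void (*blockfill_s_t)(int16_t* dst, intptr_t dstride, int16_t val);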
diff -r ddc227597df3 -r 14c93ddbd598 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Oct 04 14:03:32 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Oct 04 15:55:03 2017 +0530
@@ -2569,6 +2569,9 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg_aligned = PFX(addAvg_aligned_32x48_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg_aligned = PFX(addAvg_aligned_32x64_avx512);
+ p.cu[BLOCK_16x16].blockfill_s_aligned = PFX(blockfill_s_16x16_avx2);
+ p.cu[BLOCK_32x32].blockfill_s_aligned = PFX(blockfill_s_aligned_32x32_avx512);
+
}
}
#else // if HIGH_BIT_DEPTH
@@ -4294,6 +4297,8 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg_aligned = PFX(addAvg_aligned_32x32_avx512);
p.cu[BLOCK_32x32].blockfill_s = PFX(blockfill_s_32x32_avx512);
+ p.cu[BLOCK_16x16].blockfill_s_aligned = PFX(blockfill_s_16x16_avx2);
+ p.cu[BLOCK_32x32].blockfill_s_aligned = PFX(blockfill_s_aligned_32x32_avx512);
p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_avx512);
p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_avx512);
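Only BLOCK_32x32 receives a dedicated AVX-512 aligned kernel; BLOCK_16x16 reuses the existing AVX2 blockfill_s_16x16 routine, presumably because a 16-element int16_t row spans only 32 bytes and cannot benefit from 512-bit aligned stores. A call site is then expected to select the aligned entry point only when its destination really is 64-byte aligned, along these lines (hypothetical sketch, names illustrative, not code from this patch):

    #include <stdint.h>
    #include "primitives.h"   // EncoderPrimitives, BLOCK_* indices

    // Hypothetical helper: use the aligned kernel only when both the
    // destination pointer and the row stride (in bytes) are multiples of 64,
    // otherwise fall back to the unaligned primitive.
    static inline void fillDC(const EncoderPrimitives& p, int sizeIdx,
                              int16_t* dst, intptr_t dstride, int16_t dcVal)
    {
        bool aligned = ((uintptr_t)dst % 64 == 0) &&
                       ((dstride * sizeof(int16_t)) % 64 == 0);
        if (aligned)
            p.cu[sizeIdx].blockfill_s_aligned(dst, dstride, dcVal);
        else
            p.cu[sizeIdx].blockfill_s(dst, dstride, dcVal);
    }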
diff -r ddc227597df3 -r 14c93ddbd598 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Wed Oct 04 14:03:32 2017 +0530
+++ b/source/common/x86/blockcopy8.asm Wed Oct 04 15:55:03 2017 +0530
@@ -2574,6 +2574,24 @@
%endrep
RET
+;--------------------------------------------------------------------
+; void blockfill_s_aligned_32x32(int16_t* dst, intptr_t dstride, int16_t val)
+;--------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal blockfill_s_aligned_32x32, 3, 4, 1
+add r1, r1
+lea r3, [3 * r1]
+movd xm0, r2d
+vpbroadcastw m0, xm0
+
+%rep 8
+mova [r0], m0
+mova [r0 + r1], m0
+mova [r0 + 2 * r1], m0
+mova [r0 + r3], m0
+lea r0, [r0 + 4 * r1]
+%endrep
+RET
;-----------------------------------------------------------------------------
; void blockcopy_ps_2x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
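For readers more at home with intrinsics than with assembler syntax, the new AVX-512 routine above is roughly equivalent to the sketch below: broadcast val into one 512-bit register (a full 32 x int16_t row is exactly 64 bytes), then issue one aligned store per row, four rows per unrolled iteration, eight iterations in total:

    #include <immintrin.h>
    #include <stdint.h>

    // Rough intrinsics equivalent of blockfill_s_aligned_32x32 (illustrative
    // only, not the code that ships); dst and the byte stride are assumed to
    // be 64-byte aligned, matching the 'mova' stores in the asm.
    void blockfill_s_aligned_32x32_sketch(int16_t* dst, intptr_t dstride, int16_t val)
    {
        __m512i v = _mm512_set1_epi16(val);                   // vpbroadcastw m0, xm0
        for (int i = 0; i < 8; i++)                           // %rep 8
        {
            _mm512_store_si512((void*)(dst + 0 * dstride), v);
            _mm512_store_si512((void*)(dst + 1 * dstride), v);
            _mm512_store_si512((void*)(dst + 2 * dstride), v);
            _mm512_store_si512((void*)(dst + 3 * dstride), v);
            dst += 4 * dstride;                               // lea r0, [r0 + 4 * r1]
        }
    }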
diff -r ddc227597df3 -r 14c93ddbd598 source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h Wed Oct 04 14:03:32 2017 +0530
+++ b/source/common/x86/blockcopy8.h Wed Oct 04 15:55:03 2017 +0530
@@ -51,6 +51,7 @@
FUNCDEF_TU(void, blockfill_s, sse2, int16_t* dst, intptr_t dstride, int16_t val);
FUNCDEF_TU(void, blockfill_s, avx2, int16_t* dst, intptr_t dstride, int16_t val);
FUNCDEF_TU(void, blockfill_s, avx512, int16_t* dst, intptr_t dstride, int16_t val);
+FUNCDEF_TU(void, blockfill_s_aligned, avx512, int16_t* dst, intptr_t dstride, int16_t val);
FUNCDEF_CHROMA_PU(void, blockcopy_ss, sse2, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
FUNCDEF_CHROMA_PU(void, blockcopy_ss, avx, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
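FUNCDEF_TU emits one prototype per block size for the named kernel, with PFX adding the configured x265 symbol prefix. For the 32x32 case wired up in asm-primitives.cpp above, the declaration it produces is roughly:

    // Approximate result of the FUNCDEF_TU expansion for the one size that
    // actually has an AVX-512 body in this patch; the remaining sizes keep
    // the C reference assigned in pixel.cpp (or the AVX2 kernel for 16x16).
    void PFX(blockfill_s_aligned_32x32_avx512)(int16_t* dst, intptr_t dstride, int16_t val);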
diff -r ddc227597df3 -r 14c93ddbd598 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Wed Oct 04 14:03:32 2017 +0530
+++ b/source/test/pixelharness.cpp Wed Oct 04 15:55:03 2017 +0530
@@ -645,8 +645,33 @@
bool PixelHarness::check_blockfill_s(blockfill_s_t ref, blockfill_s_t opt)
{
- ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
- ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
+ ALIGN_VAR_64(int16_t, ref_dest[64 * 64]);
+ ALIGN_VAR_64(int16_t, opt_dest[64 * 64]);
+
+ memset(ref_dest, 0xCD, sizeof(ref_dest));
+ memset(opt_dest, 0xCD, sizeof(opt_dest));
+
+ intptr_t stride = 64;
+ for (int i = 0; i < ITERS; i++)
+ {
+ int16_t value = (rand() % SHORT_MAX) + 1;
+
+ checked(opt, opt_dest, stride, value);
+ ref(ref_dest, stride, value);
+
+ if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
+ return false;
+
+ reportfail();
+ }
+
+ return true;
+}
+
+bool PixelHarness::check_blockfill_s_aligned(blockfill_s_t ref, blockfill_s_t opt)
+{
+ ALIGN_VAR_64(int16_t, ref_dest[64 * 64]);
+ ALIGN_VAR_64(int16_t, opt_dest[64 * 64]);
memset(ref_dest, 0xCD, sizeof(ref_dest));
memset(opt_dest, 0xCD, sizeof(opt_dest));
@@ -2388,6 +2413,14 @@
}
}
+ if (opt.cu[i].blockfill_s_aligned)
+ {
+ if (!check_blockfill_s_aligned(ref.cu[i].blockfill_s_aligned, opt.cu[i].blockfill_s_aligned))
+ {
+ printf("blockfill_s_aligned[%dx%d]: failed!\n", 4 << i, 4 << i);
+ return false;
+ }
+ }
if (opt.cu[i].var)
{
if (!check_pixel_var(ref.cu[i].var, opt.cu[i].var))
@@ -3081,6 +3114,12 @@
REPORT_SPEEDUP(opt.cu[i].blockfill_s, ref.cu[i].blockfill_s, sbuf1, 64, SHORT_MAX);
}
+ if (opt.cu[i].blockfill_s_aligned)
+ {
+ HEADER("blkfill[%dx%d]", 4 << i, 4 << i);
+ REPORT_SPEEDUP(opt.cu[i].blockfill_s_aligned, ref.cu[i].blockfill_s_aligned, sbuf1, 64, SHORT_MAX);
+ }
+
if (opt.cu[i].transpose)
{
HEADER("transpose[%dx%d]", 4 << i, 4 << i);
diff -r ddc227597df3 -r 14c93ddbd598 source/test/pixelharness.h
--- a/source/test/pixelharness.h Wed Oct 04 14:03:32 2017 +0530
+++ b/source/test/pixelharness.h Wed Oct 04 15:55:03 2017 +0530
@@ -85,6 +85,7 @@
bool check_scale2D_pp(scale2D_t ref, scale2D_t opt);
bool check_ssd_s(pixel_ssd_s_t ref, pixel_ssd_s_t opt);
bool check_blockfill_s(blockfill_s_t ref, blockfill_s_t opt);
+ bool check_blockfill_s_aligned(blockfill_s_t ref, blockfill_s_t opt);
bool check_calresidual(calcresidual_t ref, calcresidual_t opt);
bool check_transpose(transpose_t ref, transpose_t opt);
bool check_weightp(weightp_pp_t ref, weightp_pp_t opt);