[x265] [PATCH 118 of 307] x86: Aligned routine implementation for scale1D_128to64 primitive

Sat Apr 7 04:31:56 CEST 2018

# HG changeset patch
# User Jayashri Murugan <jayashri at multicorewareinc.com>
# Date 1507116779 -19800
#      Wed Oct 04 17:02:59 2017 +0530
# Node ID ba20a08181382a2fb18a0d1aff7637d66fa41ac7
# Parent  1748c9a5c9b16c380f926cd5d07a69c4f13a6fab
x86: Aligned routine implementation for scale1D_128to64 primitive

diff -r 1748c9a5c9b1 -r ba20a0818138 source/common/pixel.cpp

--- a/source/common/pixel.cpp	Thu Oct 05 11:26:37 2017 +0530
+++ b/source/common/pixel.cpp	Wed Oct 04 17:02:59 2017 +0530
@@ -1264,6 +1264,7 @@
     p.weight_sp = weight_sp_c;
 
     p.scale1D_128to64 = scale1D_128to64;
+    p.scale1D_128to64_aligned = scale1D_128to64;
     p.scale2D_64to32 = scale2D_64to32;
     p.frameInitLowres = frame_init_lowres_core;
     p.ssim_4x4x2_core = ssim_4x4x2_core;
diff -r 1748c9a5c9b1 -r ba20a0818138 source/common/primitives.h
--- a/source/common/primitives.h	Thu Oct 05 11:26:37 2017 +0530
+++ b/source/common/primitives.h	Wed Oct 04 17:02:59 2017 +0530
@@ -312,6 +312,7 @@
     dequant_normal_t      dequant_normal;
     denoiseDct_t          denoiseDct;
     scale1D_t             scale1D_128to64;
+    scale1D_t             scale1D_128to64_aligned;
     scale2D_t             scale2D_64to32;
 
     ssim_4x4x2_core_t     ssim_4x4x2_core;
diff -r 1748c9a5c9b1 -r ba20a0818138 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Oct 05 11:26:37 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Oct 04 17:02:59 2017 +0530
@@ -4219,6 +4219,7 @@
         p.cu[BLOCK_64x64].copy_ps = PFX(blockcopy_ps_64x64_avx512);
 
         p.scale1D_128to64 = PFX(scale1D_128to64_avx512);
+        p.scale1D_128to64_aligned = PFX(scale1D_128to64_aligned_avx512);
 
         p.pu[LUMA_64x16].addAvg = PFX(addAvg_64x16_avx512);
         p.pu[LUMA_64x32].addAvg = PFX(addAvg_64x32_avx512);
diff -r 1748c9a5c9b1 -r ba20a0818138 source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h	Thu Oct 05 11:26:37 2017 +0530
+++ b/source/common/x86/pixel-util.h	Wed Oct 04 17:02:59 2017 +0530
@@ -37,6 +37,7 @@
     void PFX(weight_pp_ ## cpu(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)); \
     void PFX(weight_sp_ ## cpu(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)); \
     void PFX(scale1D_128to64_ ## cpu(pixel*, const pixel*)); \
+    void PFX(scale1D_128to64_aligned_ ## cpu(pixel*, const pixel*)); \
     void PFX(scale2D_64to32_ ## cpu(pixel*, const pixel*, intptr_t)); \
     uint32_t PFX(costCoeffRemain_ ## cpu(uint16_t *absCoeff, int numNonZero, int idx)); \
     uint32_t PFX(costC1C2Flag_sse2(uint16_t *absCoeff, intptr_t numNonZero, uint8_t *baseCtxMod, intptr_t ctxOffset)); \
diff -r 1748c9a5c9b1 -r ba20a0818138 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Thu Oct 05 11:26:37 2017 +0530
+++ b/source/common/x86/pixel-util8.asm	Wed Oct 04 17:02:59 2017 +0530
@@ -4599,6 +4599,37 @@
     vshufi64x2      m2, m2, q3120
     movu            [r0 + 64], m2
     RET
+
+INIT_ZMM avx512
+cglobal scale1D_128to64_aligned, 2, 2, 6 ; r0 = dst, r1 = src; halves 256 source bytes to 128 bytes by rounded averaging of adjacent byte pairs
+    pxor            m4, m4 ; m4 = 0, second operand of pavgw below
+    vbroadcasti32x8 m5, [pb_1] ; pb_1 is defined elsewhere; presumably bytes all equal to 1, so pmaddubsw yields pair sums
+
+    ;Top pixel: first 128 source bytes (m0, m1) -> 64 bytes at [r0]. All four 64-byte loads use mova, so r1 must be 64-byte aligned.
+    mova            m0, [r1]
+    mova            m1, [r1 + 64]
+    mova            m2, [r1 + 128]
+    mova            m3, [r1 + 192]
+
+    pmaddubsw       m0, m0, m5 ; each word = src[2i] + src[2i+1] (given m5 bytes are 1)
+    pavgw           m0, m4 ; (sum + 0 + 1) >> 1, i.e. rounded average of the pair
+    pmaddubsw       m1, m1, m5
+    pavgw           m1, m4
+    packuswb        m0, m1 ; words -> bytes; packs per 128-bit lane, so m0/m1 results end up interleaved
+    vpermq          m0, m0, q3120 ; swap the middle qwords inside each 256-bit half
+    vshufi64x2      m0, m0, q3120 ; swap the middle 128-bit lanes, restoring linear byte order
+    mova            [r0], m0 ; aligned store: r0 must be 64-byte aligned
+
+    ;Left pixel: second 128 source bytes (m2, m3) -> 64 bytes at [r0 + 64], same sequence as above
+    pmaddubsw       m2, m2, m5
+    pavgw           m2, m4
+    pmaddubsw       m3, m3, m5
+    pavgw           m3, m4
+    packuswb        m2, m3
+    vpermq          m2, m2, q3120
+    vshufi64x2      m2, m2, q3120
+    mova            [r0 + 64], m2
+    RET
 %endif
 
 ;-----------------------------------------------------------------
diff -r 1748c9a5c9b1 -r ba20a0818138 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Thu Oct 05 11:26:37 2017 +0530
+++ b/source/test/pixelharness.cpp	Wed Oct 04 17:02:59 2017 +0530
@@ -749,8 +749,8 @@
 
 bool PixelHarness::check_scale1D_pp(scale1D_t ref, scale1D_t opt)
 {
-    ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
-    ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
+    ALIGN_VAR_64(pixel, ref_dest[64 * 64]);
+    ALIGN_VAR_64(pixel, opt_dest[64 * 64]);
 
     memset(ref_dest, 0, sizeof(ref_dest));
     memset(opt_dest, 0, sizeof(opt_dest));
@@ -772,6 +772,31 @@
     return true;
 }
 
+bool PixelHarness::check_scale1D_pp_aligned(scale1D_t ref, scale1D_t opt)
+{ // Runs opt and ref on the same inputs for ITERS iterations; returns false on the first output mismatch, else true.
+    ALIGN_VAR_64(pixel, ref_dest[64 * 64]); // presumably forces 64-byte alignment, which the mova stores in the AVX-512 aligned kernel need -- confirm macro
+    ALIGN_VAR_64(pixel, opt_dest[64 * 64]);
+
+    memset(ref_dest, 0, sizeof(ref_dest)); // zero both buffers so elements the primitive never writes still compare equal
+    memset(opt_dest, 0, sizeof(opt_dest));
+
+    int j = 0; // source offset in pixels; never reset, so it grows across all iterations
+    for (int i = 0; i < ITERS; i++)
+    {
+        int index = i % TEST_CASES; // cycle through the test input buffers
+        checked(opt, opt_dest, pixel_test_buff[index] + j); // NOTE(review): checked() is defined elsewhere; presumably invokes opt under the harness's call checks
+        ref(ref_dest, pixel_test_buff[index] + j);
+
+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel))) // compare the complete destination buffers
+            return false;
+
+        reportfail(); // NOTE(review): defined elsewhere; presumably reports a failure flagged by checked()
+        j += INCR * 2; // NOTE(review): presumably keeps the source pointer 64-byte aligned; confirm INCR and pixel_test_buff alignment/size
+    }
+
+    return true;
+}
+
 bool PixelHarness::check_scale2D_pp(scale2D_t ref, scale2D_t opt)
 {
     ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
@@ -2587,6 +2612,15 @@
         }
     }
 
+    if (opt.scale1D_128to64_aligned)
+    {
+        if (!check_scale1D_pp_aligned(ref.scale1D_128to64_aligned, opt.scale1D_128to64_aligned))
+        {
+            printf("scale1D_128to64_aligned failed!\n");
+            return false;
+        }
+    }
+
     if (opt.scale2D_64to32)
     {
         if (!check_scale2D_pp(ref.scale2D_64to32, opt.scale2D_64to32))
@@ -3234,6 +3268,12 @@
         REPORT_SPEEDUP(opt.scale1D_128to64, ref.scale1D_128to64, pbuf2, pbuf1);
     }
 
+    if (opt.scale1D_128to64_aligned)
+    {
+        HEADER0("scale1D_128to64_aligned");
+        REPORT_SPEEDUP(opt.scale1D_128to64_aligned, ref.scale1D_128to64_aligned, pbuf2, pbuf1);
+    }
+
     if (opt.scale2D_64to32)
     {
         HEADER0("scale2D_64to32");
diff -r 1748c9a5c9b1 -r ba20a0818138 source/test/pixelharness.h
--- a/source/test/pixelharness.h	Thu Oct 05 11:26:37 2017 +0530
+++ b/source/test/pixelharness.h	Wed Oct 04 17:02:59 2017 +0530
@@ -82,6 +82,7 @@
     bool check_pixel_sub_ps(pixel_sub_ps_t ref, pixel_sub_ps_t opt);
     bool check_pixel_add_ps(pixel_add_ps_t ref, pixel_add_ps_t opt);
     bool check_scale1D_pp(scale1D_t ref, scale1D_t opt);
+    bool check_scale1D_pp_aligned(scale1D_t ref, scale1D_t opt);
     bool check_scale2D_pp(scale2D_t ref, scale2D_t opt);
     bool check_ssd_s(pixel_ssd_s_t ref, pixel_ssd_s_t opt);
     bool check_blockfill_s(blockfill_s_t ref, blockfill_s_t opt);