[x265] [PATCH 262 of 307] x86: Aligned routine implementation for cpy1Dto2D_shl primitive

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:34:20 CEST 2018


# HG changeset patch
# User Jayashri Murugan <jayashri at multicorewareinc.com>
# Date 1513143310 -19800
#      Wed Dec 13 11:05:10 2017 +0530
# Node ID 458b708e6d17aafb49a5fd369b2e9540d0268726
# Parent  ab5b1becd807647d5264381c1fb74750c20fdfae
x86: Aligned routine implementation for cpy1Dto2D_shl primitive

1. cpy1Dto2D_shl optimization
2. Aligned code implementation
3. Linking with encoder

diff -r ab5b1becd807 -r 458b708e6d17 source/common/pixel.cpp
--- a/source/common/pixel.cpp	Fri Dec 08 14:00:59 2017 +0530
+++ b/source/common/pixel.cpp	Wed Dec 13 11:05:10 2017 +0530
@@ -1004,7 +1004,8 @@
     p.cu[BLOCK_ ## W ## x ## H].blockfill_s[ALIGNED]    = blockfill_s_c<W>;  \
     p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shl = cpy2Dto1D_shl<W>; \
     p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shr = cpy2Dto1D_shr<W>; \
-    p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl = cpy1Dto2D_shl<W>; \
+    p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[NONALIGNED] = cpy1Dto2D_shl<W>; \
+    p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[ALIGNED] = cpy1Dto2D_shl<W>; \
     p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shr = cpy1Dto2D_shr<W>; \
     p.cu[BLOCK_ ## W ## x ## H].psy_cost_pp   = psyCost_pp<BLOCK_ ## W ## x ## H>; \
     p.cu[BLOCK_ ## W ## x ## H].transpose     = transpose<W>; \
diff -r ab5b1becd807 -r 458b708e6d17 source/common/primitives.h
--- a/source/common/primitives.h	Fri Dec 08 14:00:59 2017 +0530
+++ b/source/common/primitives.h	Wed Dec 13 11:05:10 2017 +0530
@@ -280,9 +280,8 @@
         count_nonzero_t count_nonzero;
         cpy2Dto1D_shl_t cpy2Dto1D_shl;
         cpy2Dto1D_shr_t cpy2Dto1D_shr;
-        cpy1Dto2D_shl_t cpy1Dto2D_shl;
+        cpy1Dto2D_shl_t cpy1Dto2D_shl[NUM_ALIGNMENT_TYPES];
         cpy1Dto2D_shr_t cpy1Dto2D_shr;
-
         copy_sp_t       copy_sp;
         copy_ps_t       copy_ps;
         copy_ss_t       copy_ss;
diff -r ab5b1becd807 -r 458b708e6d17 source/common/quant.cpp
--- a/source/common/quant.cpp	Fri Dec 08 14:00:59 2017 +0530
+++ b/source/common/quant.cpp	Wed Dec 13 11:05:10 2017 +0530
@@ -560,13 +560,11 @@
                             uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig)
 {
     const uint32_t sizeIdx = log2TrSize - 2;
-
     if (cu.m_tqBypass[0])
     {
-        primitives.cu[sizeIdx].cpy1Dto2D_shl(residual, coeff, resiStride, 0);
+        primitives.cu[sizeIdx].cpy1Dto2D_shl[resiStride % 64 == 0](residual, coeff, resiStride, 0);
         return;
     }
-
     // Values need to pass as input parameter in dequant
     int rem = m_qpParam[ttype].rem;
     int per = m_qpParam[ttype].per;
@@ -595,7 +593,7 @@
         if (transformShift > 0)
             primitives.cu[sizeIdx].cpy1Dto2D_shr(residual, m_resiDctCoeff, resiStride, transformShift);
         else
-            primitives.cu[sizeIdx].cpy1Dto2D_shl(residual, m_resiDctCoeff, resiStride, -transformShift);
+            primitives.cu[sizeIdx].cpy1Dto2D_shl[resiStride % 64 == 0](residual, m_resiDctCoeff, resiStride, -transformShift);
 #endif
     }
     else
diff -r ab5b1becd807 -r 458b708e6d17 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Dec 08 14:00:59 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Dec 13 11:05:10 2017 +0530
@@ -989,7 +989,7 @@
         ALL_LUMA_TU(blockfill_s[ALIGNED], blockfill_s, sse2);
         ALL_LUMA_TU(blockfill_s[NONALIGNED], blockfill_s, sse2);
         ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, sse2);
-        ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, sse2);
+        ALL_LUMA_TU_S(cpy1Dto2D_shl[NONALIGNED], cpy1Dto2D_shl_, sse2);
         ALL_LUMA_TU_S(cpy2Dto1D_shr, cpy2Dto1D_shr_, sse2);
         ALL_LUMA_TU_S(cpy2Dto1D_shl, cpy2Dto1D_shl_, sse2);
 #if X86_64
@@ -1692,11 +1692,9 @@
 
         ASSIGN2(p.cu[BLOCK_16x16].blockfill_s, blockfill_s_16x16_avx2);
         ASSIGN2(p.cu[BLOCK_32x32].blockfill_s, blockfill_s_32x32_avx2);
-
         ALL_LUMA_TU(count_nonzero, count_nonzero, avx2);
-        ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, avx2);
+        ALL_LUMA_TU_S(cpy1Dto2D_shl[NONALIGNED], cpy1Dto2D_shl_, avx2);
         ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, avx2);
-
         p.cu[BLOCK_8x8].copy_cnt = PFX(copy_cnt_8_avx2);
         p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx2);
         p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx2);
@@ -2526,10 +2524,10 @@
         p.pu[LUMA_64x32].sad_x4 = PFX(pixel_sad_x4_64x32_avx512);
         p.pu[LUMA_64x48].sad_x4 = PFX(pixel_sad_x4_64x48_avx512);
         p.pu[LUMA_64x64].sad_x4 = PFX(pixel_sad_x4_64x64_avx512);
-
         p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx512);
         p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512);
-        p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512);
+        p.cu[BLOCK_32x32].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_32_avx512);
+        p.cu[BLOCK_32x32].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_aligned_32_avx512);
         p.weight_pp = PFX(weight_pp_avx512);
         p.weight_sp = PFX(weight_sp_avx512);
         p.dequant_normal = PFX(dequant_normal_avx512);
@@ -3196,10 +3194,9 @@
         ALL_LUMA_TU(blockfill_s[ALIGNED], blockfill_s, sse2);
         ALL_LUMA_TU_S(cpy2Dto1D_shl, cpy2Dto1D_shl_, sse2);
         ALL_LUMA_TU_S(cpy2Dto1D_shr, cpy2Dto1D_shr_, sse2);
-        ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, sse2);
+        ALL_LUMA_TU_S(cpy1Dto2D_shl[NONALIGNED], cpy1Dto2D_shl_, sse2);
         ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, sse2);
         ALL_LUMA_TU_S(ssd_s, pixel_ssd_s_, sse2);
-
         ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse2);
         ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse2);
 
@@ -3794,12 +3791,9 @@
         p.cu[BLOCK_8x8].copy_cnt = PFX(copy_cnt_8_avx2);
         p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx2);
         p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx2);
-
         ASSIGN2(p.cu[BLOCK_16x16].blockfill_s, blockfill_s_16x16_avx2);
-
-        ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, avx2);
+        ALL_LUMA_TU_S(cpy1Dto2D_shl[NONALIGNED], cpy1Dto2D_shl_, avx2);
         ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, avx2);
-
         p.cu[BLOCK_8x8].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_8_avx2);
         p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx2);
         p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx2);
@@ -4861,7 +4855,8 @@
         p.cu[BLOCK_32x32].calcresidual[ALIGNED] = PFX(getResidual_aligned32_avx512);
         p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx512);
         p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512);
-        p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512);
+        p.cu[BLOCK_32x32].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_32_avx512);
+        p.cu[BLOCK_32x32].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_aligned_32_avx512);
 
         p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx512);
         p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx512);
diff -r ab5b1becd807 -r 458b708e6d17 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm	Fri Dec 08 14:00:59 2017 +0530
+++ b/source/common/x86/blockcopy8.asm	Wed Dec 13 11:05:10 2017 +0530
@@ -5599,17 +5599,58 @@
     add         r2d, r2d
     movd        xm0, r3d
     lea         r3, [3 * r2]
-
+%rep 3
     PROCESS_CPY1Dto2D_SHL_32x8_AVX512
     add         r1, 4 * mmsize
     lea         r0, [r0 + r2 * 4]
+%endrep
     PROCESS_CPY1Dto2D_SHL_32x8_AVX512
+    RET
+
+%macro PROCESS_CPY1Dto2D_SHL_ALIGNED_32x8_AVX512 0
+    mova        m1,            [r1 + 0 * mmsize]
+    mova        m2,            [r1 + 1 * mmsize]
+    mova        m3,            [r1 + 2 * mmsize]
+    mova        m4,            [r1 + 3 * mmsize]
+    psllw       m1,            xm0
+    psllw       m2,            xm0
+    psllw       m3,            xm0
+    psllw       m4,            xm0
+    mova        [r0],          m1
+    mova        [r0 + r2],     m2
+    mova        [r0 + 2 * r2], m3
+    mova        [r0 + r3],     m4
+
+    add         r1,            4 * mmsize
+    lea         r0,            [r0 + r2 * 4]
+
+    mova        m1,            [r1 + 0 * mmsize]
+    mova        m2,            [r1 + 1 * mmsize]
+    mova        m3,            [r1 + 2 * mmsize]
+    mova        m4,            [r1 + 3 * mmsize]
+    psllw       m1,            xm0
+    psllw       m2,            xm0
+    psllw       m3,            xm0
+    psllw       m4,            xm0
+    mova        [r0],          m1
+    mova        [r0 + r2],     m2
+    mova        [r0 + 2 * r2], m3
+    mova        [r0 + r3],     m4
+%endmacro
+;--------------------------------------------------------------------------------------
+; void cpy1Dto2D_shl_aligned(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
+;--------------------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal cpy1Dto2D_shl_aligned_32, 4, 4, 5
+    add         r2d, r2d
+    movd        xm0, r3d
+    lea         r3, [3 * r2]
+%rep 3
+    PROCESS_CPY1Dto2D_SHL_ALIGNED_32x8_AVX512
     add         r1, 4 * mmsize
     lea         r0, [r0 + r2 * 4]
-    PROCESS_CPY1Dto2D_SHL_32x8_AVX512
-    add         r1, 4 * mmsize
-    lea         r0, [r0 + r2 * 4]
-    PROCESS_CPY1Dto2D_SHL_32x8_AVX512
+%endrep
+    PROCESS_CPY1Dto2D_SHL_ALIGNED_32x8_AVX512
     RET
 ;--------------------------------------------------------------------------------------
 ; copy_cnt avx512 code end
diff -r ab5b1becd807 -r 458b708e6d17 source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h	Fri Dec 08 14:00:59 2017 +0530
+++ b/source/common/x86/blockcopy8.h	Wed Dec 13 11:05:10 2017 +0530
@@ -38,7 +38,7 @@
 FUNCDEF_TU_S(void, cpy1Dto2D_shl, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 FUNCDEF_TU_S(void, cpy1Dto2D_shl, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 FUNCDEF_TU_S(void, cpy1Dto2D_shl, avx512, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
-
+FUNCDEF_TU_S(void, cpy1Dto2D_shl_aligned, avx512, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 FUNCDEF_TU_S(void, cpy1Dto2D_shr, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 FUNCDEF_TU_S(void, cpy1Dto2D_shr, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 FUNCDEF_TU_S(void, cpy1Dto2D_shr, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
diff -r ab5b1becd807 -r 458b708e6d17 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Fri Dec 08 14:00:59 2017 +0530
+++ b/source/test/pixelharness.cpp	Wed Dec 13 11:05:10 2017 +0530
@@ -469,12 +469,10 @@
 
     return true;
 }
-
 bool PixelHarness::check_cpy1Dto2D_shl_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt)
 {
-    ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
-    ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
-
+    ALIGN_VAR_64(int16_t, ref_dest[64 * 64]);
+    ALIGN_VAR_64(int16_t, opt_dest[64 * 64]);
     memset(ref_dest, 0xCD, sizeof(ref_dest));
     memset(opt_dest, 0xCD, sizeof(opt_dest));
 
@@ -497,6 +495,33 @@
 
     return true;
 }
+bool PixelHarness::check_cpy1Dto2D_shl_aligned_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt)
+{
+    ALIGN_VAR_64(int16_t, ref_dest[64 * 64]);
+    ALIGN_VAR_64(int16_t, opt_dest[64 * 64]);
+
+    memset(ref_dest, 0xCD, sizeof(ref_dest));
+    memset(opt_dest, 0xCD, sizeof(opt_dest));
+
+    int j = 0;
+    intptr_t stride = STRIDE;
+    for (int i = 0; i < ITERS; i++)
+    {
+        int shift = (rand() % 7 + 1);
+
+        int index = i % TEST_CASES;
+        checked(opt, opt_dest, short_test_buff[index] + j, stride, shift);
+        ref(ref_dest, short_test_buff[index] + j, stride, shift);
+
+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
+            return false;
+
+        reportfail();
+        j += INCR + 32;
+    }
+
+    return true;
+}
 
 bool PixelHarness::check_cpy1Dto2D_shr_t(cpy1Dto2D_shr_t ref, cpy1Dto2D_shr_t opt)
 {
@@ -2597,15 +2622,22 @@
                     return false;
                 }
             }
-
-            if (opt.cu[i].cpy1Dto2D_shl)
+            if (opt.cu[i].cpy1Dto2D_shl[NONALIGNED])
             {
-                if (!check_cpy1Dto2D_shl_t(ref.cu[i].cpy1Dto2D_shl, opt.cu[i].cpy1Dto2D_shl))
+                if (!check_cpy1Dto2D_shl_t(ref.cu[i].cpy1Dto2D_shl[NONALIGNED], opt.cu[i].cpy1Dto2D_shl[NONALIGNED]))
                 {
                     printf("cpy1Dto2D_shl[%dx%d] failed!\n", 4 << i, 4 << i);
                     return false;
                 }
             }
+            if (opt.cu[i].cpy1Dto2D_shl[ALIGNED])
+            {
+                if (!check_cpy1Dto2D_shl_t(ref.cu[i].cpy1Dto2D_shl[ALIGNED], opt.cu[i].cpy1Dto2D_shl[ALIGNED]))
+                {
+                    printf("cpy1Dto2D_shl_aligned[%dx%d] failed!\n", 4 << i, 4 << i);
+                    return false;
+                }
+            }
 
             if (opt.cu[i].cpy1Dto2D_shr)
             {
@@ -3270,13 +3302,17 @@
             HEADER("cpy2Dto1D_shr[%dx%d]", 4 << i, 4 << i);
             REPORT_SPEEDUP(opt.cu[i].cpy2Dto1D_shr, ref.cu[i].cpy2Dto1D_shr, sbuf1, sbuf2, STRIDE, 3);
         }
-
-        if ((i < BLOCK_64x64) && opt.cu[i].cpy1Dto2D_shl)
+        if ((i < BLOCK_64x64) && opt.cu[i].cpy1Dto2D_shl[NONALIGNED])
         {
             HEADER("cpy1Dto2D_shl[%dx%d]", 4 << i, 4 << i);
-            REPORT_SPEEDUP(opt.cu[i].cpy1Dto2D_shl, ref.cu[i].cpy1Dto2D_shl, sbuf1, sbuf2, STRIDE, 64);
+            REPORT_SPEEDUP(opt.cu[i].cpy1Dto2D_shl[NONALIGNED], ref.cu[i].cpy1Dto2D_shl[NONALIGNED], sbuf1, sbuf2, STRIDE, 64);
         }
 
+        if ((i < BLOCK_64x64) && opt.cu[i].cpy1Dto2D_shl[ALIGNED])
+        {
+            HEADER("cpy1Dto2D_shl_aligned[%dx%d]", 4 << i, 4 << i);
+            REPORT_SPEEDUP(opt.cu[i].cpy1Dto2D_shl[ALIGNED], ref.cu[i].cpy1Dto2D_shl[ALIGNED], sbuf1, sbuf2, STRIDE, 64);
+        }
         if ((i < BLOCK_64x64) && opt.cu[i].cpy1Dto2D_shr)
         {
             HEADER("cpy1Dto2D_shr[%dx%d]", 4 << i, 4 << i);
diff -r ab5b1becd807 -r 458b708e6d17 source/test/pixelharness.h
--- a/source/test/pixelharness.h	Fri Dec 08 14:00:59 2017 +0530
+++ b/source/test/pixelharness.h	Wed Dec 13 11:05:10 2017 +0530
@@ -97,6 +97,7 @@
     bool check_cpy2Dto1D_shl_t(cpy2Dto1D_shl_t ref, cpy2Dto1D_shl_t opt);
     bool check_cpy2Dto1D_shr_t(cpy2Dto1D_shr_t ref, cpy2Dto1D_shr_t opt);
     bool check_cpy1Dto2D_shl_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt);
+    bool check_cpy1Dto2D_shl_aligned_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt);
     bool check_cpy1Dto2D_shr_t(cpy1Dto2D_shr_t ref, cpy1Dto2D_shr_t opt);
     bool check_copy_cnt_t(copy_cnt_t ref, copy_cnt_t opt);
     bool check_pixel_var(var_t ref, var_t opt);


More information about the x265-devel mailing list