[x265] [PATCH 271 of 307] Aligned routine implementation of ssd_s primitive

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:34:29 CEST 2018


# HG changeset patch
# User Jayashree
# Date 1513585304 -19800
#      Mon Dec 18 13:51:44 2017 +0530
# Node ID fd28f49cb7b30aab97105a59ec841812af205cb9
# Parent  265fd2e1e49587837ebed4e7efcc38a0f6e79346
Aligned routine implementation of ssd_s primitive

diff -r 265fd2e1e495 -r fd28f49cb7b3 source/common/pixel.cpp
--- a/source/common/pixel.cpp	Wed Dec 13 16:51:23 2017 +0530
+++ b/source/common/pixel.cpp	Mon Dec 18 13:51:44 2017 +0530
@@ -1009,7 +1009,8 @@
     p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shr = cpy1Dto2D_shr<W>; \
     p.cu[BLOCK_ ## W ## x ## H].psy_cost_pp   = psyCost_pp<BLOCK_ ## W ## x ## H>; \
     p.cu[BLOCK_ ## W ## x ## H].transpose     = transpose<W>; \
-    p.cu[BLOCK_ ## W ## x ## H].ssd_s         = pixel_ssd_s_c<W>; \
+    p.cu[BLOCK_ ## W ## x ## H].ssd_s[NONALIGNED]         = pixel_ssd_s_c<W>; \
+    p.cu[BLOCK_ ## W ## x ## H].ssd_s[ALIGNED] = pixel_ssd_s_c<W>; \
     p.cu[BLOCK_ ## W ## x ## H].var           = pixel_var<W>; \
     p.cu[BLOCK_ ## W ## x ## H].calcresidual[NONALIGNED]  = getResidual<W>; \
     p.cu[BLOCK_ ## W ## x ## H].calcresidual[ALIGNED]     = getResidual<W>; \
diff -r 265fd2e1e495 -r fd28f49cb7b3 source/common/primitives.h
--- a/source/common/primitives.h	Wed Dec 13 16:51:23 2017 +0530
+++ b/source/common/primitives.h	Mon Dec 18 13:51:44 2017 +0530
@@ -290,9 +290,8 @@
         pixel_sse_t     sse_pp;        // Sum of Square Error (pixel, pixel) fenc alignment not assumed
         pixel_sse_ss_t  sse_ss;        // Sum of Square Error (short, short) fenc alignment not assumed
         pixelcmp_t      psy_cost_pp;   // difference in AC energy between two pixel blocks
-        pixel_ssd_s_t   ssd_s;         // Sum of Square Error (residual coeff to self)
+        pixel_ssd_s_t   ssd_s[NUM_ALIGNMENT_TYPES];         // Sum of Square Error (residual coeff to self)
         pixelcmp_t      sa8d;          // Sum of Transformed Differences (8x8 Hadamard), uses satd for 4x4 intra TU
-
         transpose_t     transpose;     // transpose pixel block; for use with intra all-angs
         intra_allangs_t intra_pred_allangs;
         intra_filter_t  intra_filter;
diff -r 265fd2e1e495 -r fd28f49cb7b3 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Dec 13 16:51:23 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Dec 18 13:51:44 2017 +0530
@@ -1015,10 +1015,10 @@
         ALL_LUMA_TU_S(cpy2Dto1D_shr, cpy2Dto1D_shr_, sse2);
         ALL_LUMA_TU_S(cpy2Dto1D_shl, cpy2Dto1D_shl_, sse2);
 #if X86_64
-        p.cu[BLOCK_4x4].ssd_s = PFX(pixel_ssd_s_4_sse2);
-        p.cu[BLOCK_8x8].ssd_s = PFX(pixel_ssd_s_8_sse2);
-        p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16_sse2);
-        p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_sse2);
+        ASSIGN2(p.cu[BLOCK_4x4].ssd_s,pixel_ssd_s_4_sse2 );
+        ASSIGN2(p.cu[BLOCK_8x8].ssd_s,pixel_ssd_s_8_sse2);
+        ASSIGN2(p.cu[BLOCK_16x16].ssd_s,pixel_ssd_s_16_sse2);
+        ASSIGN2(p.cu[BLOCK_32x32].ssd_s,pixel_ssd_s_32_sse2 );
 #endif
         ALL_LUMA_TU_S(calcresidual[ALIGNED], getResidual, sse2);
         ALL_LUMA_TU_S(calcresidual[NONALIGNED], getResidual, sse2);
@@ -1681,9 +1681,8 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = PFX(pixel_satd_16x8_avx2);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx2);
 
-        p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16_avx2);
-        p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx2);
-
+        ASSIGN2( p.cu[BLOCK_16x16].ssd_s,pixel_ssd_s_16_avx2);
+        ASSIGN2( p.cu[BLOCK_32x32].ssd_s,pixel_ssd_s_32_avx2);
         p.cu[BLOCK_16x16].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_16x16_avx2);
         p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_32x32_avx2);
         p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_64x64_avx2);
@@ -2450,9 +2449,8 @@
         p.chroma[X265_CSP_I444].pu[LUMA_64x32].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x32_avx512);
         p.chroma[X265_CSP_I444].pu[LUMA_64x48].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x48_avx512);
         p.chroma[X265_CSP_I444].pu[LUMA_64x64].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x64_avx512);
-
-        p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx512);
-
+        p.cu[BLOCK_32x32].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_32_avx512);
+        p.cu[BLOCK_32x32].ssd_s[ALIGNED] = PFX(pixel_ssd_s_aligned_32_avx512);
         p.pu[LUMA_16x32].sad = PFX(pixel_sad_16x32_avx512);
         p.pu[LUMA_16x64].sad = PFX(pixel_sad_16x64_avx512);
         p.pu[LUMA_32x8].sad = PFX(pixel_sad_32x8_avx512);
@@ -3235,10 +3233,9 @@
         ALL_LUMA_TU_S(cpy1Dto2D_shl[ALIGNED], cpy1Dto2D_shl_, sse2);
         ALL_LUMA_TU_S(cpy1Dto2D_shl[NONALIGNED], cpy1Dto2D_shl_, sse2);
         ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, sse2);
-        ALL_LUMA_TU_S(ssd_s, pixel_ssd_s_, sse2);
+        ALL_LUMA_TU_S(ssd_s[NONALIGNED], pixel_ssd_s_, sse2);
         ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse2);
         ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse2);
-
         p.cu[BLOCK_4x4].intra_pred[2] = PFX(intra_pred_ang4_2_sse2);
         p.cu[BLOCK_4x4].intra_pred[3] = PFX(intra_pred_ang4_3_sse2);
         p.cu[BLOCK_4x4].intra_pred[4] = PFX(intra_pred_ang4_4_sse2);
@@ -3822,9 +3819,8 @@
         p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sse_pp = PFX(pixel_ssd_16x16_avx2);
         p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = PFX(pixel_ssd_32x32_avx2);
 
-        p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16_avx2);
-        p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx2);
-
+        ASSIGN2(p.cu[BLOCK_16x16].ssd_s, pixel_ssd_s_16_avx2);
+        ASSIGN2(p.cu[BLOCK_32x32].ssd_s, pixel_ssd_s_32_avx2);
         p.cu[BLOCK_8x8].copy_cnt = PFX(copy_cnt_8_avx2);
         p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx2);
         p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx2);
@@ -4880,9 +4876,10 @@
         p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512);
         p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx512);
         p.cu[BLOCK_16x16].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_16x16_avx512);
-        p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx512);
-        p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16_avx512);
-
+        p.cu[BLOCK_32x32].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_32_avx512);
+        p.cu[BLOCK_32x32].ssd_s[ALIGNED] = PFX(pixel_ssd_s_aligned_32_avx512); // aligned slot gets the aligned-load kernel, as for 16x16
+        p.cu[BLOCK_16x16].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_16_avx512);
+        p.cu[BLOCK_16x16].ssd_s[ALIGNED] = PFX(pixel_ssd_s_aligned_16_avx512);
         p.cu[BLOCK_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512);
         p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512);
         p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ss = PFX(blockcopy_ss_32x64_avx512);
diff -r 265fd2e1e495 -r fd28f49cb7b3 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h	Wed Dec 13 16:51:23 2017 +0530
+++ b/source/common/x86/pixel.h	Mon Dec 18 13:51:44 2017 +0530
@@ -54,7 +54,9 @@
     FUNCDEF_CHROMA_PU(void, addAvg, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
     FUNCDEF_CHROMA_PU(void, addAvg_aligned, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
     FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
+    FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_s_aligned, cpu, const int16_t*, intptr_t); \
     FUNCDEF_TU_S(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
+    FUNCDEF_TU_S(sse_t, pixel_ssd_s_aligned, cpu, const int16_t*, intptr_t); \
     FUNCDEF_TU(uint64_t, pixel_var, cpu, const pixel*, intptr_t); \
     FUNCDEF_TU(int, psyCost_pp, cpu, const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride); \
     FUNCDEF_TU(int, psyCost_ss, cpu, const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
diff -r 265fd2e1e495 -r fd28f49cb7b3 source/common/x86/ssd-a.asm
--- a/source/common/x86/ssd-a.asm	Wed Dec 13 16:51:23 2017 +0530
+++ b/source/common/x86/ssd-a.asm	Mon Dec 18 13:51:44 2017 +0530
@@ -3489,3 +3489,116 @@
 ;-----------------------------------------------------------------------------
 ; ssd_s avx512 code end
 ;-----------------------------------------------------------------------------
+;-----------------------------------------------------------------------------
+; Aligned 16x8 ssd_s step: adds the squared int16 values of 8 rows (16 per row) into m0
+;-----------------------------------------------------------------------------
+%macro PROCESS_SSD_S_16x8_ALIGNED_AVX512 0
+    mova             ym1,   [r0]                    ; row 0 -> low 256 bits (aligned load)
+    vinserti32x8     m1,    [r0 + r1],     1        ; row 1 -> high 256 bits (r1 = stride in bytes)
+    mova             ym2,   [r0 + 2 * r1]           ; row 2
+    vinserti32x8     m2,    [r0 + r3],     1        ; row 3 (r3 = 3 * r1, set up by the caller)
+    lea              r0,    [r0 + 4 * r1]           ; advance 4 rows; the caller advances the other 4
+    mova             ym3,   [r0]                    ; row 4
+    vinserti32x8     m3,    [r0 + r1],     1        ; row 5
+    mova             ym4,   [r0 + 2 * r1]           ; row 6
+    vinserti32x8     m4,    [r0 + r3],     1        ; row 7
+    pmaddwd m1, m1                                  ; square each int16, add adjacent pairs -> int32
+    pmaddwd m2, m2
+    pmaddwd m3, m3
+    pmaddwd m4, m4
+    paddd   m1, m2                                  ; reduce the four partial sums ...
+    paddd   m3, m4
+    paddd   m1, m3
+    paddd   m0, m1                                  ; ... and fold them into the running total in m0
+%endmacro
+;---------------------------------------------------------------------------------
+;sse_t pixel_ssd_s_aligned_16( const int16_t *ref, intptr_t i_stride )
+;-----------------------------------------------------------------------------------
+%if ARCH_X86_64
+; 16x16 block of int16: two 16x8 passes accumulated in m0, then a horizontal dword add
+
+INIT_ZMM avx512
+cglobal pixel_ssd_s_aligned_16, 2,4,5
+    add     r1, r1                      ; stride: int16 elements -> bytes
+    lea     r3, [r1 * 3]                ; r3 = 3 * stride, used by the row macro
+    pxor    m0, m0                      ; clear the dword accumulators
+
+    PROCESS_SSD_S_16x8_ALIGNED_AVX512
+    lea     r0, [r0 + 4 * r1]
+    PROCESS_SSD_S_16x8_ALIGNED_AVX512
+
+    ; calculate sum and return
+    HADDD   m0, m1
+    movd    eax, xm0
+    RET
+%endif
+;---------------------------------------------------------------------------------------------
+; Aligned 32x8 ssd_s step: adds the squared int16 values of 8 rows (one register per row) into m0
+;---------------------------------------------------------------------------------------------
+%macro PROCESS_SSD_S_32x8_ALIGNED_AVX512 0
+    mova    m1, [r0]                    ; rows 0-3, aligned loads (r1 = stride in bytes, r3 = 3 * r1)
+    mova    m2, [r0 + r1]
+    mova    m3, [r0 + 2 * r1]
+    mova    m4, [r0 + r3]
+
+    pmaddwd m1, m1                      ; square each int16, add adjacent pairs -> int32
+    pmaddwd m2, m2
+    pmaddwd m3, m3
+    pmaddwd m4, m4
+    paddd   m1, m2                      ; reduce the four rows and accumulate into m0
+    paddd   m3, m4
+    paddd   m1, m3
+    paddd   m0, m1
+
+    lea     r0, [r0 + 4 * r1]           ; advance 4 rows; the caller advances the other 4
+
+    mova    m1, [r0]                    ; rows 4-7, same pattern
+    mova    m2, [r0 + r1]
+    mova    m3, [r0 + 2 * r1]
+    mova    m4, [r0 + r3]
+
+    pmaddwd m1, m1
+    pmaddwd m2, m2
+    pmaddwd m3, m3
+    pmaddwd m4, m4
+    paddd   m1, m2
+    paddd   m3, m4
+    paddd   m1, m3
+    paddd   m0, m1
+%endmacro
+
+%if ARCH_X86_64
+INIT_ZMM avx512
+cglobal pixel_ssd_s_aligned_32, 2,4,5
+    add     r1, r1                      ; stride: int16 elements -> bytes
+    lea     r3, [r1 * 3]                ; r3 = 3 * stride, used by the row macro
+    pxor    m0, m0                      ; clear the dword accumulators
+    ; 32x32 block of int16: four 8-row passes, every pass uses the aligned-load macro
+    PROCESS_SSD_S_32x8_ALIGNED_AVX512
+    lea     r0, [r0 + 4 * r1]
+    PROCESS_SSD_S_32x8_ALIGNED_AVX512
+    lea     r0, [r0 + 4 * r1]
+    PROCESS_SSD_S_32x8_ALIGNED_AVX512
+    lea     r0, [r0 + 4 * r1]
+    PROCESS_SSD_S_32x8_ALIGNED_AVX512
+
+    ; calculate sum and return
+%if BIT_DEPTH >= 10
+    mova            m1, m0              ; widen the dword sums to qwords before the horizontal add
+    pxor            m2, m2
+    punpckldq       m0, m2
+    punpckhdq       m1, m2
+    paddq           m0, m1
+    vextracti32x8   ym2, m0, 1          ; fold 512 -> 256 -> 128 -> 64 bits
+    paddq           ym0, ym2
+    vextracti32x4   xm2, m0, 1
+    paddq           xm2, xm0
+    movhlps         xm1, xm2
+    paddq           xm2, xm1
+    movq            rax, xm2            ; 64-bit result
+%else
+    HADDD   m0, m1
+    movd    eax, xm0
+%endif
+    RET
+%endif
\ No newline at end of file
diff -r 265fd2e1e495 -r fd28f49cb7b3 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Wed Dec 13 16:51:23 2017 +0530
+++ b/source/test/pixelharness.cpp	Mon Dec 18 13:51:44 2017 +0530
@@ -267,10 +267,27 @@
         reportfail();
         j += INCR;
     }
-
     return true;
 }
-
+bool PixelHarness::check_ssd_s_aligned(pixel_ssd_s_t ref, pixel_ssd_s_t opt)
+{
+    int j = 0; // element offset into sbuf1 for the current iteration
+    for (int i = 0; i < ITERS; i++)
+    {
+        // NOTE: stride must be a multiple of 32 int16 (64 bytes) so every row stays aligned for 64-byte loads
+        int stride = (STRIDE + (rand() % STRIDE)) & ~31;
+        sse_t cres = ref(sbuf1 + j, stride);
+        sse_t vres = (sse_t)checked(opt, sbuf1 + j, (intptr_t)stride);
+        // the optimized aligned routine must reproduce the reference result exactly
+        if (cres != vres)
+            return false;
+
+        reportfail();
+        j += INCR+32; // NOTE(review): assumes sbuf1 and this step keep the start address 64-byte aligned - confirm
+    }
+
+    return true;
+}
 bool PixelHarness::check_weightp(weightp_sp_t ref, weightp_sp_t opt)
 {
     ALIGN_VAR_16(pixel, ref_dest[64 * (64 + 1)]);
@@ -2619,16 +2636,22 @@
                     return false;
                 }
             }
-
-            if (opt.cu[i].ssd_s)
+            if (opt.cu[i].ssd_s[NONALIGNED])
             {
-                if (!check_ssd_s(ref.cu[i].ssd_s, opt.cu[i].ssd_s))
+                if (!check_ssd_s(ref.cu[i].ssd_s[NONALIGNED], opt.cu[i].ssd_s[NONALIGNED]))
                 {
                     printf("ssd_s[%dx%d]: failed!\n", 4 << i, 4 << i);
                     return false;
                 }
             }
-
+            if (opt.cu[i].ssd_s[ALIGNED])
+            {
+                if (!check_ssd_s_aligned(ref.cu[i].ssd_s[ALIGNED], opt.cu[i].ssd_s[ALIGNED]))
+                {
+                    printf("ssd_s_aligned[%dx%d]: failed!\n", 4 << i, 4 << i);
+                    return false;
+                }
+            }
             if (opt.cu[i].copy_cnt)
             {
                 if (!check_copy_cnt_t(ref.cu[i].copy_cnt, opt.cu[i].copy_cnt))
@@ -3278,13 +3301,17 @@
             measurePartition(part, ref, opt);
         }
     }
-
     for (int i = 0; i < NUM_CU_SIZES; i++)
     {
-        if ((i <= BLOCK_32x32) && opt.cu[i].ssd_s)
+        if ((i <= BLOCK_32x32) && opt.cu[i].ssd_s[NONALIGNED])
         {
             HEADER("ssd_s[%dx%d]", 4 << i, 4 << i);
-            REPORT_SPEEDUP(opt.cu[i].ssd_s, ref.cu[i].ssd_s, sbuf1, STRIDE);
+            REPORT_SPEEDUP(opt.cu[i].ssd_s[NONALIGNED], ref.cu[i].ssd_s[NONALIGNED], sbuf1, STRIDE);
+        }
+        if ((i <= BLOCK_32x32) && opt.cu[i].ssd_s[ALIGNED]) // benchmark the aligned variant under its own label
+        {
+            HEADER("ssd_s_aligned[%dx%d]", 4 << i, 4 << i);
+            REPORT_SPEEDUP(opt.cu[i].ssd_s[ALIGNED], ref.cu[i].ssd_s[ALIGNED], sbuf1, STRIDE);
         }
         if (opt.cu[i].sa8d)
         {
diff -r 265fd2e1e495 -r fd28f49cb7b3 source/test/pixelharness.h
--- a/source/test/pixelharness.h	Wed Dec 13 16:51:23 2017 +0530
+++ b/source/test/pixelharness.h	Mon Dec 18 13:51:44 2017 +0530
@@ -87,6 +87,7 @@
     bool check_scale1D_pp_aligned(scale1D_t ref, scale1D_t opt);
     bool check_scale2D_pp(scale2D_t ref, scale2D_t opt);
     bool check_ssd_s(pixel_ssd_s_t ref, pixel_ssd_s_t opt);
+    bool check_ssd_s_aligned(pixel_ssd_s_t ref, pixel_ssd_s_t opt);
     bool check_blockfill_s(blockfill_s_t ref, blockfill_s_t opt);
     bool check_blockfill_s_aligned(blockfill_s_t ref, blockfill_s_t opt);
     bool check_calresidual(calcresidual_t ref, calcresidual_t opt);


More information about the x265-devel mailing list