[x265] [PATCH 2 of 3] asm: cvt16to32_cnt[4x4] for TSkip

Sat Aug 2 02:56:49 CEST 2014

# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1406940987 25200
# Node ID a25d83e9037bb62015d5d62f18f8182620a44d8c
# Parent  c1d8dda4f1f17c679655156c3cd55805ad9eca4d
asm: cvt16to32_cnt[4x4] for TSkip

diff -r c1d8dda4f1f1 -r a25d83e9037b source/common/primitives.cpp

--- a/source/common/primitives.cpp	Fri Aug 01 17:56:11 2014 -0700
+++ b/source/common/primitives.cpp	Fri Aug 01 17:56:27 2014 -0700
@@ -57,6 +57,7 @@
 void Setup_C_IPFilterPrimitives(EncoderPrimitives &p);
 void Setup_C_IPredPrimitives(EncoderPrimitives &p);
 void Setup_C_LoopFilterPrimitives(EncoderPrimitives &p);
+void Setup_C_TSkipPrimitives(EncoderPrimitives &p);
 
 void Setup_C_Primitives(EncoderPrimitives &p)
 {
@@ -65,6 +66,7 @@
     Setup_C_IPFilterPrimitives(p);   // ipfilter.cpp
     Setup_C_IPredPrimitives(p);      // intrapred.cpp
     Setup_C_LoopFilterPrimitives(p); // loopfilter.cpp
+    Setup_C_TSkipPrimitives(p);      // quant.cpp
 }
 
 void Setup_Alias_Primitives(EncoderPrimitives &p)
diff -r c1d8dda4f1f1 -r a25d83e9037b source/common/primitives.h
--- a/source/common/primitives.h	Fri Aug 01 17:56:11 2014 -0700
+++ b/source/common/primitives.h	Fri Aug 01 17:56:27 2014 -0700
@@ -150,6 +150,7 @@
 
 typedef void (*cvt16to32_shl_t)(int32_t *dst, int16_t *src, intptr_t, int, int);
 typedef void (*cvt32to16_shr_t)(int16_t *dst, int32_t *src, intptr_t, int, int);
+typedef uint32_t (*cvt16to32_cnt_t)(coeff_t* coeff, int16_t* residual, intptr_t stride);
 
 typedef void (*dct_t)(int16_t *src, int32_t *dst, intptr_t stride);
 typedef void (*idct_t)(int32_t *src, int16_t *dst, intptr_t stride);
@@ -218,6 +219,7 @@
     blockcpy_ps_t   blockcpy_ps;                     // block copy pixel from short
     cvt16to32_shl_t cvt16to32_shl;
     cvt32to16_shr_t cvt32to16_shr;
+    cvt16to32_cnt_t cvt16to32_cnt[NUM_SQUARE_BLOCKS - 1];
 
     copy_pp_t       luma_copy_pp[NUM_LUMA_PARTITIONS];
     copy_sp_t       luma_copy_sp[NUM_LUMA_PARTITIONS];
diff -r c1d8dda4f1f1 -r a25d83e9037b source/common/quant.cpp
--- a/source/common/quant.cpp	Fri Aug 01 17:56:11 2014 -0700
+++ b/source/common/quant.cpp	Fri Aug 01 17:56:27 2014 -0700
@@ -365,17 +365,8 @@
     int trSize = 1 << log2TrSize;
     if (cu->getCUTransquantBypass(absPartIdx))
     {
-        uint32_t numSig = 0;
-        for (int k = 0; k < trSize; k++)
-        {
-            for (int j = 0; j < trSize; j++)
-            {
-                coeff[k * trSize + j] = ((int16_t)residual[k * stride + j]);
-                numSig += (residual[k * stride + j] != 0);
-            }
-        }
-
-        return numSig;
+        X265_CHECK(log2TrSize >= 2 && log2TrSize <= 5, "Block size mistake!\n");
+        return primitives.cvt16to32_cnt[log2TrSize - 2](coeff, residual, stride);
     }
 
     X265_CHECK((cu->m_slice->m_sps->quadtreeTULog2MaxSize >= log2TrSize), "transform size too large\n");
@@ -1166,3 +1157,33 @@
 
     return (sigRight | sigLower) & 1;
 }
+
+
+namespace x265 {
+// x265 private namespace
+// conv16to32_count<trSize>: copy a trSize x trSize block from residual into coeff and count its nonzero samples.
+template<int trSize>
+uint32_t conv16to32_count(coeff_t* coeff, int16_t* residual, intptr_t stride) // returns the number of nonzero samples
+{
+    uint32_t numSig = 0; // running count of nonzero residual samples
+    for (int k = 0; k < trSize; k++) // k: row index
+    {
+        for (int j = 0; j < trSize; j++) // j: column index
+        {
+            coeff[k * trSize + j] = ((int16_t)residual[k * stride + j]); // dst rows are packed (pitch trSize); src rows are `stride` elements apart
+            numSig += (residual[k * stride + j] != 0); // the comparison contributes 0 or 1
+        }
+    }
+
+    return numSig;
+}
+
+void Setup_C_TSkipPrimitives(EncoderPrimitives& p) // installs the C implementations into p.cvt16to32_cnt
+{
+    p.cvt16to32_cnt[BLOCK_4x4] = conv16to32_count<4>; // one template instantiation per square block size
+    p.cvt16to32_cnt[BLOCK_8x8] = conv16to32_count<8>;
+    p.cvt16to32_cnt[BLOCK_16x16] = conv16to32_count<16>;
+    p.cvt16to32_cnt[BLOCK_32x32] = conv16to32_count<32>;
+}
+
+}
diff -r c1d8dda4f1f1 -r a25d83e9037b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Aug 01 17:56:11 2014 -0700
+++ b/source/common/x86/asm-primitives.cpp	Fri Aug 01 17:56:27 2014 -0700
@@ -1229,6 +1229,9 @@
         CHROMA_ADDAVG(_sse4);
         p.cvt16to32_shl = x265_cvt16to32_shl_sse4;
 
+        // TODO: check POPCNT flag! The SSE4 kernel below executes popcnt but is installed on the SSE4 check alone.
+        p.cvt16to32_cnt[BLOCK_4x4] = x265_cvt16to32_cnt_4_sse4;
+
         HEVC_SATD(sse4);
         SA8D_INTER_FROM_BLOCK(sse4);
 
@@ -1327,6 +1330,7 @@
         p.sad_x4[LUMA_16x12] = x265_pixel_sad_x4_16x12_avx2;
         p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_avx2;
         p.ssd_s[BLOCK_32x32] = x265_pixel_ssd_s_32_avx2;
+        p.cvt16to32_cnt[BLOCK_4x4] = x265_cvt16to32_cnt_4_avx2;
     }
 #endif // if HIGH_BIT_DEPTH
 }
diff -r c1d8dda4f1f1 -r a25d83e9037b source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm	Fri Aug 01 17:56:11 2014 -0700
+++ b/source/common/x86/blockcopy8.asm	Fri Aug 01 17:56:27 2014 -0700
@@ -29,6 +29,8 @@
 
 tab_Vm:    db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
 
+cextern pw_4
+
 SECTION .text
 
 ;-----------------------------------------------------------------------------
@@ -3097,6 +3099,81 @@
     add             r1,       r2
     dec             r5d
     jnz             .loop_row
-
     RET
 
+
+;--------------------------------------------------------------------------------------
+; uint32_t cvt16to32_cnt(int32_t *dst, int16_t *src, intptr_t stride);
+;--------------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal cvt16to32_cnt_4, 3,3,5 ; r0 = dst, r1 = src, r2 = stride (argument order per the prototype comment above)
+    add         r2d, r2d ; double the stride; presumably converts an int16_t element stride to bytes
+    pxor        m4, m4 ; m4 = 0, compare operand for the zero test
+
+    ; row 0 & 1
+    movh        m0, [r1] ; low 8 bytes  = row 0 (4 x int16)
+    movhps      m0, [r1 + r2] ; high 8 bytes = row 1
+    mova        m2, m0 ; keep the 16-bit rows 0-1 for the zero test
+    pmovsxwd    m1, m0 ; sign-extend the low 4 words (row 0) to dwords
+    punpckhwd   m0, m0 ; duplicate each high word (row 1) into a dword slot
+    psrad       m0, 16 ; arithmetic shift completes the sign extension of row 1
+    movu        [r0 + 0 * mmsize], m1 ; store row 0 as 4 x int32
+    movu        [r0 + 1 * mmsize], m0 ; store row 1 as 4 x int32
+
+    ; row 2 & 3
+    movh        m0, [r1 + r2 * 2] ; row 2
+    lea         r2, [r2 * 3] ; r2 = offset of row 3
+    movhps      m0, [r1 + r2] ; row 3
+    packsswb    m2, m0 ; 16 words -> 16 signed-saturated bytes; a nonzero word stays nonzero
+    pcmpeqb     m2, m4 ; byte = 0xFF where the sample is zero
+    pmovsxwd    m1, m0 ; sign-extend row 2
+    punpckhwd   m0, m0 ; row 3: same duplicate-and-shift sign extension as row 1
+    psrad       m0, 16
+    movu        [r0 + 2 * mmsize], m1 ; store row 2
+    movu        [r0 + 3 * mmsize], m0 ; store row 3
+
+    ; get count
+    ; CHECK_ME: Intel documents POPCNT as an SSE4.2 instruction, but it is only implemented from Nehalem onward
+%if 1
+    pmovmskb    eax, m2 ; one mask bit per byte, set where the sample is zero
+    not         ax ; invert the low 16 bits: set where the sample is nonzero
+    popcnt      ax, ax ; eax = number of nonzero samples (upper bits already zero from pmovmskb)
+%else
+    movhlps     m3, m2 ; NOTE(review): disabled non-POPCNT path; its word-wise adds on byte masks look unverified - confirm before enabling
+    paddw       m2, m3
+
+    mova        m3, [pw_4]
+    paddw       m3, m2
+    psadbw      m3, m4
+
+    movd        eax, m3
+%endif
+    RET
+
+
+INIT_YMM avx2
+cglobal cvt16to32_cnt_4, 3,3,5 ; r0 = dst, r1 = src, r2 = stride (same argument order as the SSE4 version)
+    add         r2d, r2d ; double the stride; presumably converts an int16_t element stride to bytes
+    pxor        m4, m4 ; m4 = 0, compare operand for the zero test
+
+    ; row 0 & 1
+    movq        xm0, [r1] ; low 8 bytes  = row 0 (4 x int16)
+    movhps      xm0, [r1 + r2] ; high 8 bytes = row 1
+    pmovsxwd    m1, xm0 ; sign-extend 8 words to 8 dwords in one ymm register
+    movu        [r0 + 0 * mmsize], m1 ; store rows 0-1 as 8 x int32
+
+    ; row 2 & 3
+    movq        xm1, [r1 + r2 * 2] ; row 2
+    lea         r2, [r2 * 3] ; r2 = offset of row 3
+    movhps      xm1, [r1 + r2] ; row 3
+    pmovsxwd    m2, xm1 ; sign-extend rows 2-3
+    movu        [r0 + 1 * mmsize], m2 ; store rows 2-3 as 8 x int32
+
+    packsswb    xm0, xm1 ; 16 words -> 16 signed-saturated bytes; a nonzero word stays nonzero
+    pcmpeqb     xm0, xm4 ; byte = 0xFF where the sample is zero
+
+    ; get count
+    pmovmskb    eax, xm0 ; one mask bit per byte, set where the sample is zero
+    not         ax ; invert the low 16 bits: set where the sample is nonzero
+    popcnt      ax, ax ; eax = number of nonzero samples (upper bits already zero from pmovmskb)
+    RET
diff -r c1d8dda4f1f1 -r a25d83e9037b source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h	Fri Aug 01 17:56:11 2014 -0700
+++ b/source/common/x86/blockcopy8.h	Fri Aug 01 17:56:27 2014 -0700
@@ -26,6 +26,11 @@
 
 void x265_cvt32to16_shr_sse2(int16_t * dst, int *src, intptr_t, int, int);
 void x265_cvt16to32_shl_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
+uint32_t x265_cvt16to32_cnt_4_sse4(int32_t * dst, int16_t * src, intptr_t);
+uint32_t x265_cvt16to32_cnt_8_sse4(int32_t * dst, int16_t * src, intptr_t);
+uint32_t x265_cvt16to32_cnt_16_sse4(int32_t * dst, int16_t * src, intptr_t);
+uint32_t x265_cvt16to32_cnt_32_sse4(int32_t * dst, int16_t * src, intptr_t);
+uint32_t x265_cvt16to32_cnt_4_avx2(int32_t * dst, int16_t * src, intptr_t);
 
 #define SETUP_BLOCKCOPY_FUNC(W, H, cpu) \
     void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); \
diff -r c1d8dda4f1f1 -r a25d83e9037b source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Fri Aug 01 17:56:11 2014 -0700
+++ b/source/test/pixelharness.cpp	Fri Aug 01 17:56:27 2014 -0700
@@ -603,6 +603,35 @@
     return true;
 }
 
+bool PixelHarness::check_cvt16to32_cnt_t(cvt16to32_cnt_t ref, cvt16to32_cnt_t opt) // true when opt matches ref (count and output) on every iteration
+{
+    ALIGN_VAR_16(int32_t, ref_dest[64 * 64]); // destination written by the reference primitive
+    ALIGN_VAR_16(int32_t, opt_dest[64 * 64]); // destination written by the primitive under test
+
+    memset(ref_dest, 0xCD, sizeof(ref_dest)); // identical fill so bytes neither primitive writes still compare equal
+    memset(opt_dest, 0xCD, sizeof(opt_dest));
+
+    int j = 0; // offset into sbuf1, advanced by INCR each iteration
+    intptr_t stride = STRIDE;
+    for (int i = 0; i < ITERS; i++)
+    {
+#ifdef _DEBUG
+        memset(ref_dest, 0xCD, sizeof(ref_dest)); // debug builds reset both buffers before every iteration
+        memset(opt_dest, 0xCD, sizeof(opt_dest));
+#endif
+        int opt_cnt = checked(opt, opt_dest, sbuf1 + j, stride); // `checked` is defined outside this hunk; presumably a call wrapper that validates the asm call - confirm
+        int ref_cnt = ref(ref_dest, sbuf1 + j, stride); // same input through the reference implementation
+
+        if ((ref_cnt != opt_cnt) || memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int32_t))) // compare the counts and the whole buffers
+            return false;
+
+        reportfail(); // defined outside this hunk; presumably reports a failure recorded by checked() - confirm
+        j += INCR;
+    }
+
+    return true;
+}
+
 bool PixelHarness::check_pixelavg_pp(pixelavg_pp_t ref, pixelavg_pp_t opt)
 {
     ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
@@ -1368,6 +1397,15 @@
                 return false;
             }
         }
+
+        if ((i < BLOCK_64x64) && opt.cvt16to32_cnt[i])
+        {
+            if (!check_cvt16to32_cnt_t(ref.cvt16to32_cnt[i], opt.cvt16to32_cnt[i]))
+            {
+                printf("cvt16to32_cnt[%dx%d] failed!\n", 4 << i, 4 << i);
+                return false;
+            }
+        }
     }
 
     if (opt.cvt32to16_shr)
@@ -1726,6 +1764,12 @@
             HEADER("var[%dx%d]", 4 << i, 4 << i);
             REPORT_SPEEDUP(opt.var[i], ref.var[i], pbuf1, STRIDE);
         }
+
+        if ((i < BLOCK_64x64) && opt.cvt16to32_cnt[i])
+        {
+            HEADER("cvt16to32_cnt[%dx%d]", 4 << i, 4 << i);
+            REPORT_SPEEDUP(opt.cvt16to32_cnt[i], ref.cvt16to32_cnt[i], ibuf1, sbuf2, STRIDE);
+        }
     }
 
     if (opt.cvt32to16_shr)
diff -r c1d8dda4f1f1 -r a25d83e9037b source/test/pixelharness.h
--- a/source/test/pixelharness.h	Fri Aug 01 17:56:11 2014 -0700
+++ b/source/test/pixelharness.h	Fri Aug 01 17:56:27 2014 -0700
@@ -63,6 +63,7 @@
     bool check_downscale_t(downscale_t ref, downscale_t opt);
     bool check_cvt32to16_shr_t(cvt32to16_shr_t ref, cvt32to16_shr_t opt);
     bool check_cvt16to32_shl_t(cvt16to32_shl_t ref, cvt16to32_shl_t opt);
+    bool check_cvt16to32_cnt_t(cvt16to32_cnt_t ref, cvt16to32_cnt_t opt);
     bool check_pixel_var(var_t ref, var_t opt);
     bool check_ssim_4x4x2_core(ssim_4x4x2_core_t ref, ssim_4x4x2_core_t opt);
     bool check_ssim_end(ssim_end4_t ref, ssim_end4_t opt);