[x265] [PATCH] added copy_shr primitive
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Tue Sep 2 16:13:08 CEST 2014
# HG changeset patch
# User Praveen Tiwari
# Date 1409652070 -19800
# Node ID 2667a0e3afdc2b95ff73c962b3e25366162d8e8d
# Parent 16de8fd2837c853c974f83f9aba9e8ef09c2fe2b
added copy_shr primitive
diff -r 16de8fd2837c -r 2667a0e3afdc source/common/pixel.cpp
--- a/source/common/pixel.cpp Tue Sep 02 14:38:41 2014 +0530
+++ b/source/common/pixel.cpp Tue Sep 02 15:31:10 2014 +0530
@@ -470,6 +470,22 @@
}
}
+void copy_shr(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size) /* copy a size x size coefficient block, right-shifting each value by 'shift' with round-to-nearest; src rows are packed (pitch == size), dst rows are 'stride' elements apart */
+{
+ int round = 1 << (shift - 1); /* rounding offset so the arithmetic shift rounds to nearest */
+
+ for (int i = 0; i < size; i++)
+ {
+ for (int j = 0; j < size; j++)
+ {
+ dst[j] = (int16_t)((src[j] + round) >> shift); /* (src + round) >> shift, truncated back to int16_t */
+ }
+
+ src += size; /* source is packed: the next row starts immediately */
+ dst += stride; /* destination rows are 'stride' int16_t apart */
+ }
+}
+
template<int size>
void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
{
@@ -1213,6 +1229,8 @@
p.cvt32to16_shl[BLOCK_16x16] = convert32to16_shl<16>;
p.cvt32to16_shl[BLOCK_32x32] = convert32to16_shl<32>;
+ p.copy_shr = copy_shr;
+
p.sa8d[BLOCK_4x4] = satd_4x4;
p.sa8d[BLOCK_8x8] = sa8d_8x8;
p.sa8d[BLOCK_16x16] = sa8d_16x16;
diff -r 16de8fd2837c -r 2667a0e3afdc source/common/primitives.h
--- a/source/common/primitives.h Tue Sep 02 14:38:41 2014 +0530
+++ b/source/common/primitives.h Tue Sep 02 15:31:10 2014 +0530
@@ -154,6 +154,7 @@
typedef void (*cvt32to16_shr_t)(int16_t *dst, int32_t *src, intptr_t, int, int);
typedef void (*cvt32to16_shl_t)(int16_t *dst, int32_t *src, intptr_t, int);
typedef uint32_t (*copy_cnt_t)(int16_t* coeff, int16_t* residual, intptr_t stride);
+typedef void (*copy_shr_t)(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size);
typedef void (*dct_t)(int16_t *src, int32_t *dst, intptr_t stride);
typedef void (*idct_t)(int32_t *src, int16_t *dst, intptr_t stride);
@@ -227,6 +228,7 @@
cvt32to16_shr_t cvt32to16_shr;
cvt32to16_shl_t cvt32to16_shl[NUM_SQUARE_BLOCKS - 1];
copy_cnt_t copy_cnt[NUM_SQUARE_BLOCKS - 1];
+ copy_shr_t copy_shr;
copy_pp_t luma_copy_pp[NUM_LUMA_PARTITIONS];
copy_sp_t luma_copy_sp[NUM_LUMA_PARTITIONS];
diff -r 16de8fd2837c -r 2667a0e3afdc source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Sep 02 14:38:41 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Sep 02 15:31:10 2014 +0530
@@ -1670,6 +1670,7 @@
INTRA_ANG_SSE4(sse4);
p.dct[DCT_8x8] = x265_dct8_sse4;
+ p.copy_shr = x265_copy_shr_sse4;
}
if (cpuMask & X265_CPU_AVX)
{
diff -r 16de8fd2837c -r 2667a0e3afdc source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Tue Sep 02 14:38:41 2014 +0530
+++ b/source/common/x86/blockcopy8.asm Tue Sep 02 15:31:10 2014 +0530
@@ -4400,3 +4400,79 @@
psadbw xm0, xm4
movd eax, xm0
RET
+
+;-----------------------------------------------------------------------------
+; void copy_shr(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size)
+;-----------------------------------------------------------------------------
+
+INIT_XMM sse4
+cglobal copy_shr, 4, 7, 4, dst, src, stride
+%define rnd m2
+%define shift m1
+
+ ; load the shift count into an xmm register so psrad can use it
+ mov r5d, r3m
+ movd shift, r5d
+
+ ; build round = 1 << (shift - 1), the round-to-nearest offset
+ dec r5
+ xor r6, r6
+ bts r6, r5 ; set bit (shift - 1) in a zeroed register
+
+ movd rnd, r6d
+ pshufd rnd, rnd, 0 ; broadcast round to all four dwords
+
+ ; register alloc
+ ; r0 - dst
+ ; r1 - src
+ ; r2 - dst stride in bytes (stride * sizeof(int16_t))
+ ; r3 - lx (remaining 4-sample column groups in the current row)
+ ; r4 - size (block dimension, in samples)
+ ; r5 - ly (remaining row pairs: two rows handled per iteration)
+ ; r6 - diff (dst byte advance from end of one row pair to start of the next)
+ add r2d, r2d
+
+ mov r4d, r4m
+ mov r5, r4 ; size
+ mov r6, r2 ; stride in bytes
+ sub r6, r4
+ add r6, r6 ; r6 = 2 * (stride_bytes - size)
+
+ shr r5, 1 ; process rows in pairs (assumes size is even)
+.loop_row:
+
+ mov r3, r4
+ shr r3, 2 ; 4 samples per iteration (assumes size % 4 == 0)
+.loop_col:
+ ; row 0: widen to dwords, add round, arithmetic shift, narrow back
+ movh m3, [r1]
+ pmovsxwd m0, m3
+ paddd m0, rnd
+ psrad m0, shift
+ packssdw m0, m0
+ movh [r0], m0
+
+ ; row 1: src pitch is size samples (packed), dst pitch is r2 bytes
+ movh m3, [r1 + r4 * 2]
+ pmovsxwd m0, m3
+ paddd m0, rnd
+ psrad m0, shift
+ packssdw m0, m0
+ movh [r0 + r2], m0
+
+ ; move col pointer (4 int16_t = 8 bytes)
+ add r1, 8
+ add r0, 8
+
+ dec r3
+ jg .loop_col
+
+ ; update pointers: skip src row 1 (already consumed), step dst to next pair
+ lea r1, [r1 + r4 * 2]
+ add r0, r6
+
+ ; end of loop_row
+ dec r5
+ jg .loop_row
+
+ RET
diff -r 16de8fd2837c -r 2667a0e3afdc source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h Tue Sep 02 14:38:41 2014 +0530
+++ b/source/common/x86/blockcopy8.h Tue Sep 02 15:31:10 2014 +0530
@@ -38,6 +38,7 @@
void x265_cvt16to32_shr_8_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
void x265_cvt16to32_shr_16_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
void x265_cvt16to32_shr_32_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
+void x265_copy_shr_sse4(int16_t * dst, int16_t *src, intptr_t, int, int);
uint32_t x265_copy_cnt_4_sse4(int16_t * dst, int16_t * src, intptr_t);
uint32_t x265_copy_cnt_8_sse4(int16_t * dst, int16_t * src, intptr_t);
uint32_t x265_copy_cnt_16_sse4(int16_t * dst, int16_t * src, intptr_t);
diff -r 16de8fd2837c -r 2667a0e3afdc source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Tue Sep 02 14:38:41 2014 +0530
+++ b/source/test/pixelharness.cpp Tue Sep 02 15:31:10 2014 +0530
@@ -606,6 +606,34 @@
return true;
}
+bool PixelHarness::check_copy_shr_t(copy_shr_t ref, copy_shr_t opt) /* verify the optimized copy_shr against the C reference over random shifts and inputs */
+{
+ ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
+ ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
+
+ memset(ref_dest, 0xCD, sizeof(ref_dest)); /* sentinel fill so stray out-of-block writes are caught by the full-buffer compare */
+ memset(opt_dest, 0xCD, sizeof(opt_dest));
+
+ int j = 0;
+ intptr_t stride = STRIDE;
+ for (int i = 0; i < ITERS; i++)
+ {
+ int shift = (rand() % 7 + 1); /* shift in [1, 7]; shift >= 1 keeps round = 1 << (shift - 1) well-defined */
+
+ int index = i % TEST_CASES;
+ checked(opt, opt_dest, short_test_buff[index] + j, stride, shift, (int)STRIDE);
+ ref(ref_dest, short_test_buff[index] + j, stride, shift, (int)STRIDE);
+
+ if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t))) /* compare whole buffers, sentinel area included */
+ return false;
+
+ reportfail();
+ j += INCR; /* advance into the test buffer so each iteration sees different data */
+ }
+
+ return true;
+}
+
bool PixelHarness::check_pixelavg_pp(pixelavg_pp_t ref, pixelavg_pp_t opt)
{
ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
@@ -1541,6 +1569,15 @@
}
}
+ if (opt.copy_shr)
+ {
+ if (!check_copy_shr_t(ref.copy_shr, opt.copy_shr))
+ {
+ printf("copy_shr failed!\n");
+ return false;
+ }
+ }
+
return true;
}
@@ -1875,4 +1912,11 @@
HEADER0("planecopy_cp");
REPORT_SPEEDUP(opt.planecopy_cp, ref.planecopy_cp, uchar_test_buff[0], 64, pbuf1, 64, 64, 64, 2);
}
+
+ if (opt.copy_shr)
+ {
+ HEADER0("copy_shr");
+ REPORT_SPEEDUP(opt.copy_shr, ref.copy_shr, sbuf1, sbuf2, 64, 5, 64);
+ }
+
}
diff -r 16de8fd2837c -r 2667a0e3afdc source/test/pixelharness.h
--- a/source/test/pixelharness.h Tue Sep 02 14:38:41 2014 +0530
+++ b/source/test/pixelharness.h Tue Sep 02 15:31:10 2014 +0530
@@ -89,6 +89,7 @@
bool check_cvt16to32_shr_t(cvt16to32_shr_t ref, cvt16to32_shr_t opt);
bool check_cvt32to16_shl_t(cvt32to16_shl_t ref, cvt32to16_shl_t opt);
bool check_copy_cnt_t(copy_cnt_t ref, copy_cnt_t opt);
+ bool check_copy_shr_t(copy_shr_t ref, copy_shr_t opt);
bool check_pixel_var(var_t ref, var_t opt);
bool check_ssim_4x4x2_core(ssim_4x4x2_core_t ref, ssim_4x4x2_core_t opt);
bool check_ssim_end(ssim_end4_t ref, ssim_end4_t opt);
More information about the x265-devel
mailing list