[x265-commits] [x265] analysis: explicit locking for pmode and pme parameters

Fri Nov 21 07:43:27 CET 2014

details:   http://hg.videolan.org/x265/rev/2f8df4c972b9
branches:  
changeset: 8876:2f8df4c972b9
user:      Steve Borho <steve at borho.org>
date:      Thu Nov 20 20:35:50 2014 -0600
description:
analysis: explicit locking for pmode and pme parameters

We've found a repro case involving --no-wpp --pmode --pme --preset slower where
time-starved worker threads get stuck in the findJob() routine and are pushed off
the CPU; in the meantime, the master thread moves on to another CU. This caused
very hard-to-reproduce crashes.
Subject: [x265] fix copy16to16_shl

details:   http://hg.videolan.org/x265/rev/5a8da9cb52e8
branches:  
changeset: 8877:5a8da9cb52e8
user:      Satoshi Nakagawa <nakagawa424 at oki.com>
date:      Fri Nov 21 09:49:14 2014 +0900
description:
fix copy16to16_shl
Subject: [x265] dst4_c: optimization

details:   http://hg.videolan.org/x265/rev/49b66c57972d
branches:  
changeset: 8878:49b66c57972d
user:      Praveen Tiwari
date:      Fri Nov 21 09:12:33 2014 +0530
description:
dst4_c: optimization
Subject: [x265] idst4_c: optimization

details:   http://hg.videolan.org/x265/rev/8f373c20bc41
branches:  
changeset: 8879:8f373c20bc41
user:      Praveen Tiwari
date:      Fri Nov 21 09:27:00 2014 +0530
description:
idst4_c: optimization
Subject: [x265] dct4_c: C code optimization

details:   http://hg.videolan.org/x265/rev/d4376e113855
branches:  
changeset: 8880:d4376e113855
user:      Praveen Tiwari
date:      Fri Nov 21 09:43:00 2014 +0530
description:
dct4_c: C code optimization
Subject: [x265] dct8_c: optimization

details:   http://hg.videolan.org/x265/rev/d426e93e240c
branches:  
changeset: 8881:d426e93e240c
user:      Praveen Tiwari
date:      Fri Nov 21 10:51:30 2014 +0530
description:
dct8_c: optimization
Subject: [x265] dct16_c: optimization

details:   http://hg.videolan.org/x265/rev/7e94ea285179
branches:  
changeset: 8882:7e94ea285179
user:      Praveen Tiwari
date:      Fri Nov 21 10:59:04 2014 +0530
description:
dct16_c: optimization
Subject: [x265] dct32_c: optimization

details:   http://hg.videolan.org/x265/rev/a60dfb900169
branches:  
changeset: 8883:a60dfb900169
user:      Praveen Tiwari
date:      Fri Nov 21 11:06:20 2014 +0530
description:
dct32_c: optimization
Subject: [x265] idct4_c: optimization

details:   http://hg.videolan.org/x265/rev/69a472a77b49
branches:  
changeset: 8884:69a472a77b49
user:      Praveen Tiwari
date:      Fri Nov 21 11:22:19 2014 +0530
description:
idct4_c: optimization
Subject: [x265] idct8_c: optimization

details:   http://hg.videolan.org/x265/rev/f7d7c480b85d
branches:  
changeset: 8885:f7d7c480b85d
user:      Praveen Tiwari
date:      Fri Nov 21 11:28:16 2014 +0530
description:
idct8_c: optimization
Subject: [x265] idct16_c: optimization

details:   http://hg.videolan.org/x265/rev/388c893d3825
branches:  
changeset: 8886:388c893d3825
user:      Praveen Tiwari
date:      Fri Nov 21 11:31:27 2014 +0530
description:
idct16_c: optimization
Subject: [x265] idct32_c: C code optimization

details:   http://hg.videolan.org/x265/rev/346fccbba4de
branches:  
changeset: 8887:346fccbba4de
user:      Praveen Tiwari
date:      Fri Nov 21 12:03:54 2014 +0530
description:
idct32_c: C code optimization

diffstat:

 source/common/dct.cpp                |  129 ++--------------------------------
 source/common/pixel.cpp              |    3 +-
 source/common/x86/asm-primitives.cpp |    2 +-
 source/common/x86/blockcopy8.asm     |   26 +++---
 source/common/x86/blockcopy8.h       |    2 +-
 source/encoder/analysis.cpp          |   43 +++++++----
 source/encoder/analysis.h            |    1 +
 source/encoder/search.cpp            |   32 +++++---
 source/encoder/search.h              |    2 +-
 9 files changed, 74 insertions(+), 166 deletions(-)

diffs (truncated from 479 to 300 lines):

diff -r 2abf89f5c4f2 -r 346fccbba4de source/common/dct.cpp

--- a/source/common/dct.cpp	Thu Nov 20 14:31:04 2014 -0800
+++ b/source/common/dct.cpp	Fri Nov 21 12:03:54 2014 +0530
@@ -454,18 +454,7 @@ void dst4_c(const int16_t *src, int16_t 
     }
 
     fastForwardDst(block, coef, shift_1st);
-    fastForwardDst(coef, block, shift_2nd);
-
-#define N (4)
-    for (int i = 0; i < N; i++)
-    {
-        for (int j = 0; j < N; j++)
-        {
-            dst[i * N + j] = block[i * N + j];
-        }
-    }
-
-#undef N
+    fastForwardDst(coef, dst, shift_2nd);
 }
 
 void dct4_c(const int16_t *src, int16_t *dst, intptr_t stride)
@@ -482,17 +471,7 @@ void dct4_c(const int16_t *src, int16_t 
     }
 
     partialButterfly4(block, coef, shift_1st, 4);
-    partialButterfly4(coef, block, shift_2nd, 4);
-#define N (4)
-    for (int i = 0; i < N; i++)
-    {
-        for (int j = 0; j < N; j++)
-        {
-            dst[i * N + j] = block[i * N + j];
-        }
-    }
-
-#undef N
+    partialButterfly4(coef, dst, shift_2nd, 4);
 }
 
 void dct8_c(const int16_t *src, int16_t *dst, intptr_t stride)
@@ -509,18 +488,7 @@ void dct8_c(const int16_t *src, int16_t 
     }
 
     partialButterfly8(block, coef, shift_1st, 8);
-    partialButterfly8(coef, block, shift_2nd, 8);
-
-#define N (8)
-    for (int i = 0; i < N; i++)
-    {
-        for (int j = 0; j < N; j++)
-        {
-            dst[i * N + j] = block[i * N + j];
-        }
-    }
-
-#undef N
+    partialButterfly8(coef, dst, shift_2nd, 8);
 }
 
 void dct16_c(const int16_t *src, int16_t *dst, intptr_t stride)
@@ -537,18 +505,7 @@ void dct16_c(const int16_t *src, int16_t
     }
 
     partialButterfly16(block, coef, shift_1st, 16);
-    partialButterfly16(coef, block, shift_2nd, 16);
-
-#define N (16)
-    for (int i = 0; i < N; i++)
-    {
-        for (int j = 0; j < N; j++)
-        {
-            dst[i * N + j] = block[i * N + j];
-        }
-    }
-
-#undef N
+    partialButterfly16(coef, dst, shift_2nd, 16);
 }
 
 void dct32_c(const int16_t *src, int16_t *dst, intptr_t stride)
@@ -565,18 +522,7 @@ void dct32_c(const int16_t *src, int16_t
     }
 
     partialButterfly32(block, coef, shift_1st, 32);
-    partialButterfly32(coef, block, shift_2nd, 32);
-
-#define N (32)
-    for (int i = 0; i < N; i++)
-    {
-        for (int j = 0; j < N; j++)
-        {
-            dst[i * N + j] = block[i * N + j];
-        }
-    }
-
-#undef N
+    partialButterfly32(coef, dst, shift_2nd, 32);
 }
 
 void idst4_c(const int16_t *src, int16_t *dst, intptr_t stride)
@@ -587,18 +533,7 @@ void idst4_c(const int16_t *src, int16_t
     ALIGN_VAR_32(int16_t, coef[4 * 4]);
     ALIGN_VAR_32(int16_t, block[4 * 4]);
 
-#define N (4)
-    for (int i = 0; i < N; i++)
-    {
-        for (int j = 0; j < N; j++)
-        {
-            block[i * N + j] = (int16_t)src[i * N + j];
-        }
-    }
-
-#undef N
-
-    inversedst(block, coef, shift_1st); // Forward DST BY FAST ALGORITHM, block input, coef output
+    inversedst(src, coef, shift_1st); // Forward DST BY FAST ALGORITHM, block input, coef output
     inversedst(coef, block, shift_2nd); // Forward DST BY FAST ALGORITHM, coef input, coeff output
 
     for (int i = 0; i < 4; i++)
@@ -615,18 +550,7 @@ void idct4_c(const int16_t *src, int16_t
     ALIGN_VAR_32(int16_t, coef[4 * 4]);
     ALIGN_VAR_32(int16_t, block[4 * 4]);
 
-#define N (4)
-    for (int i = 0; i < N; i++)
-    {
-        for (int j = 0; j < N; j++)
-        {
-            block[i * N + j] = (int16_t)src[i * N + j];
-        }
-    }
-
-#undef N
-
-    partialButterflyInverse4(block, coef, shift_1st, 4); // Forward DST BY FAST ALGORITHM, block input, coef output
+    partialButterflyInverse4(src, coef, shift_1st, 4); // Forward DST BY FAST ALGORITHM, block input, coef output
     partialButterflyInverse4(coef, block, shift_2nd, 4); // Forward DST BY FAST ALGORITHM, coef input, coeff output
 
     for (int i = 0; i < 4; i++)
@@ -643,18 +567,7 @@ void idct8_c(const int16_t *src, int16_t
     ALIGN_VAR_32(int16_t, coef[8 * 8]);
     ALIGN_VAR_32(int16_t, block[8 * 8]);
 
-#define N (8)
-    for (int i = 0; i < N; i++)
-    {
-        for (int j = 0; j < N; j++)
-        {
-            block[i * N + j] = (int16_t)src[i * N + j];
-        }
-    }
-
-#undef N
-
-    partialButterflyInverse8(block, coef, shift_1st, 8);
+    partialButterflyInverse8(src, coef, shift_1st, 8);
     partialButterflyInverse8(coef, block, shift_2nd, 8);
     for (int i = 0; i < 8; i++)
     {
@@ -670,18 +583,7 @@ void idct16_c(const int16_t *src, int16_
     ALIGN_VAR_32(int16_t, coef[16 * 16]);
     ALIGN_VAR_32(int16_t, block[16 * 16]);
 
-#define N (16)
-    for (int i = 0; i < N; i++)
-    {
-        for (int j = 0; j < N; j++)
-        {
-            block[i * N + j] = (int16_t)src[i * N + j];
-        }
-    }
-
-#undef N
-
-    partialButterflyInverse16(block, coef, shift_1st, 16);
+    partialButterflyInverse16(src, coef, shift_1st, 16);
     partialButterflyInverse16(coef, block, shift_2nd, 16);
     for (int i = 0; i < 16; i++)
     {
@@ -697,18 +599,7 @@ void idct32_c(const int16_t *src, int16_
     ALIGN_VAR_32(int16_t, coef[32 * 32]);
     ALIGN_VAR_32(int16_t, block[32 * 32]);
 
-#define N (32)
-    for (int i = 0; i < N; i++)
-    {
-        for (int j = 0; j < N; j++)
-        {
-            block[i * N + j] = (int16_t)src[i * N + j];
-        }
-    }
-
-#undef N
-
-    partialButterflyInverse32(block, coef, shift_1st, 32);
+    partialButterflyInverse32(src, coef, shift_1st, 32);
     partialButterflyInverse32(coef, block, shift_2nd, 32);
 
     for (int i = 0; i < 32; i++)
diff -r 2abf89f5c4f2 -r 346fccbba4de source/common/pixel.cpp
--- a/source/common/pixel.cpp	Thu Nov 20 14:31:04 2014 -0800
+++ b/source/common/pixel.cpp	Fri Nov 21 12:03:54 2014 +0530
@@ -493,11 +493,12 @@ void blockfil_s_c(int16_t* dst, intptr_t
 
 void copy16to16_shl(int16_t *dst, const int16_t *src, intptr_t stride, int shift, int size)
 {
+    X265_CHECK(!(size & 3), "invalid size\n");
     for (int i = 0; i < size; i++)
     {
         for (int j = 0; j < size; j++)
         {
-            dst[i * size + j] = (src[i * stride + j]) << shift;
+            dst[i * size + j] = src[i * stride + j] << shift;
         }
     }
 }
diff -r 2abf89f5c4f2 -r 346fccbba4de source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Nov 20 14:31:04 2014 -0800
+++ b/source/common/x86/asm-primitives.cpp	Fri Nov 21 12:03:54 2014 +0530
@@ -1548,6 +1548,7 @@ void Setup_Assembly_Primitives(EncoderPr
         p.frame_init_lowres_core = x265_frame_init_lowres_core_sse2;
         SA8D_INTER_FROM_BLOCK(sse2);
 
+        p.cpy16to16_shl = x265_copy16to16_shl_sse2;
         p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_sse2;
         p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_sse2;
         p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_sse2;
@@ -1614,7 +1615,6 @@ void Setup_Assembly_Primitives(EncoderPr
         LUMA_ADDAVG(_sse4);
         CHROMA_ADDAVG(_sse4);
         CHROMA_ADDAVG_422(_sse4);
-        p.cpy16to16_shl = x265_copy16to16_shl_sse4;
         p.cvt16to32_shr[BLOCK_4x4] = x265_cvt16to32_shr_4_sse4;
         p.cvt16to32_shr[BLOCK_8x8] = x265_cvt16to32_shr_8_sse4;
         p.cvt16to32_shr[BLOCK_16x16] = x265_cvt16to32_shr_16_sse4;
diff -r 2abf89f5c4f2 -r 346fccbba4de source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm	Thu Nov 20 14:31:04 2014 -0800
+++ b/source/common/x86/blockcopy8.asm	Fri Nov 21 12:03:54 2014 +0530
@@ -3672,37 +3672,35 @@ BLOCKCOPY_SS_W64_H4_avx 64, 64
 ;--------------------------------------------------------------------------------------
 ; void copy16to16_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size);
 ;--------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal copy16to16_shl, 5, 7, 2, dst, src, stride, shift, size
+INIT_XMM sse2
+cglobal copy16to16_shl, 5, 6, 2, dst, src, stride, shift, size
 %define shift       m1
 
     ; make shift
-    mov             r5d,      r3m
-    movd            shift,    r5d
+    movd            shift,    r3d
 
     ; register alloc
     ; r0 - dst
     ; r1 - src
     ; r2 - stride
-    ; r3 - shift
     ; r4 - size
 
     sub             r2d,      r4d
     add             r2d,      r2d
     mov             r5d,      r4d
-    shr             r4d,      3
+    shr             r4d,      2
 .loop_row:
-    mov             r6d,      r4d
+    mov             r3d,      r4d
 
 .loop_col:
-    movu            m0,       [r1]
+    movh            m0,       [r1]
     psllw           m0,       shift
-    movu            [r0],     m0
-
-    add             r1,       16
-    add             r0,       16
-
-    dec             r6d
+    movh            [r0],     m0
+
+    add             r1,       8
+    add             r0,       8
+
+    dec             r3d
     jnz             .loop_col
 
     add             r1,       r2
diff -r 2abf89f5c4f2 -r 346fccbba4de source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h	Thu Nov 20 14:31:04 2014 -0800
+++ b/source/common/x86/blockcopy8.h	Fri Nov 21 12:03:54 2014 +0530
@@ -32,7 +32,7 @@ void x265_cvt32to16_shl_4_avx2(int16_t* 
 void x265_cvt32to16_shl_8_avx2(int16_t* dst, const int* src, intptr_t, int);
 void x265_cvt32to16_shl_16_avx2(int16_t* dst, const int* src, intptr_t, int);
 void x265_cvt32to16_shl_32_avx2(int16_t* dst, const int* src, intptr_t, int);
-void x265_copy16to16_shl_sse4(int16_t* dst, const int16_t* src, intptr_t, int32_t, int32_t);
+void x265_copy16to16_shl_sse2(int16_t* dst, const int16_t* src, intptr_t, int32_t, int32_t);