[x265-commits] [x265] analysis: explicit locking for pmode and pme parameters
Steve Borho
steve at borho.org
Fri Nov 21 07:43:27 CET 2014
details: http://hg.videolan.org/x265/rev/2f8df4c972b9
branches:
changeset: 8876:2f8df4c972b9
user: Steve Borho <steve at borho.org>
date: Thu Nov 20 20:35:50 2014 -0600
description:
analysis: explicit locking for pmode and pme parameters
We've found a repro case involving --no-wpp --pmode --pme --preset slower where
time-starved worker threads get stuck in the findJob() routine and are pushed off
the CPU; in the meantime the master thread moves on to another CU. This caused
very hard-to-reproduce crashes.
Subject: [x265] fix copy16to16_shl
details: http://hg.videolan.org/x265/rev/5a8da9cb52e8
branches:
changeset: 8877:5a8da9cb52e8
user: Satoshi Nakagawa <nakagawa424 at oki.com>
date: Fri Nov 21 09:49:14 2014 +0900
description:
fix copy16to16_shl
Subject: [x265] dst4_c: optimization
details: http://hg.videolan.org/x265/rev/49b66c57972d
branches:
changeset: 8878:49b66c57972d
user: Praveen Tiwari
date: Fri Nov 21 09:12:33 2014 +0530
description:
dst4_c: optimization
Subject: [x265] idst4_c: optimization
details: http://hg.videolan.org/x265/rev/8f373c20bc41
branches:
changeset: 8879:8f373c20bc41
user: Praveen Tiwari
date: Fri Nov 21 09:27:00 2014 +0530
description:
idst4_c: optimization
Subject: [x265] dct4_c: C code optimization
details: http://hg.videolan.org/x265/rev/d4376e113855
branches:
changeset: 8880:d4376e113855
user: Praveen Tiwari
date: Fri Nov 21 09:43:00 2014 +0530
description:
dct4_c: C code optimization
Subject: [x265] dct8_c: optimization
details: http://hg.videolan.org/x265/rev/d426e93e240c
branches:
changeset: 8881:d426e93e240c
user: Praveen Tiwari
date: Fri Nov 21 10:51:30 2014 +0530
description:
dct8_c: optimization
Subject: [x265] dct16_c: optimization
details: http://hg.videolan.org/x265/rev/7e94ea285179
branches:
changeset: 8882:7e94ea285179
user: Praveen Tiwari
date: Fri Nov 21 10:59:04 2014 +0530
description:
dct16_c: optimization
Subject: [x265] dct32_c: optimization
details: http://hg.videolan.org/x265/rev/a60dfb900169
branches:
changeset: 8883:a60dfb900169
user: Praveen Tiwari
date: Fri Nov 21 11:06:20 2014 +0530
description:
dct32_c: optimization
Subject: [x265] idct4_c: optimization
details: http://hg.videolan.org/x265/rev/69a472a77b49
branches:
changeset: 8884:69a472a77b49
user: Praveen Tiwari
date: Fri Nov 21 11:22:19 2014 +0530
description:
idct4_c: optimization
Subject: [x265] idct8_c: optimization
details: http://hg.videolan.org/x265/rev/f7d7c480b85d
branches:
changeset: 8885:f7d7c480b85d
user: Praveen Tiwari
date: Fri Nov 21 11:28:16 2014 +0530
description:
idct8_c: optimization
Subject: [x265] idct16_c: optimization
details: http://hg.videolan.org/x265/rev/388c893d3825
branches:
changeset: 8886:388c893d3825
user: Praveen Tiwari
date: Fri Nov 21 11:31:27 2014 +0530
description:
idct16_c: optimization
Subject: [x265] idct32_c: C code optimization
details: http://hg.videolan.org/x265/rev/346fccbba4de
branches:
changeset: 8887:346fccbba4de
user: Praveen Tiwari
date: Fri Nov 21 12:03:54 2014 +0530
description:
idct32_c: C code optimization
diffstat:
source/common/dct.cpp | 129 ++--------------------------------
source/common/pixel.cpp | 3 +-
source/common/x86/asm-primitives.cpp | 2 +-
source/common/x86/blockcopy8.asm | 26 +++---
source/common/x86/blockcopy8.h | 2 +-
source/encoder/analysis.cpp | 43 +++++++----
source/encoder/analysis.h | 1 +
source/encoder/search.cpp | 32 +++++---
source/encoder/search.h | 2 +-
9 files changed, 74 insertions(+), 166 deletions(-)
diffs (truncated from 479 to 300 lines):
diff -r 2abf89f5c4f2 -r 346fccbba4de source/common/dct.cpp
--- a/source/common/dct.cpp Thu Nov 20 14:31:04 2014 -0800
+++ b/source/common/dct.cpp Fri Nov 21 12:03:54 2014 +0530
@@ -454,18 +454,7 @@ void dst4_c(const int16_t *src, int16_t
}
fastForwardDst(block, coef, shift_1st);
- fastForwardDst(coef, block, shift_2nd);
-
-#define N (4)
- for (int i = 0; i < N; i++)
- {
- for (int j = 0; j < N; j++)
- {
- dst[i * N + j] = block[i * N + j];
- }
- }
-
-#undef N
+ fastForwardDst(coef, dst, shift_2nd);
}
void dct4_c(const int16_t *src, int16_t *dst, intptr_t stride)
@@ -482,17 +471,7 @@ void dct4_c(const int16_t *src, int16_t
}
partialButterfly4(block, coef, shift_1st, 4);
- partialButterfly4(coef, block, shift_2nd, 4);
-#define N (4)
- for (int i = 0; i < N; i++)
- {
- for (int j = 0; j < N; j++)
- {
- dst[i * N + j] = block[i * N + j];
- }
- }
-
-#undef N
+ partialButterfly4(coef, dst, shift_2nd, 4);
}
void dct8_c(const int16_t *src, int16_t *dst, intptr_t stride)
@@ -509,18 +488,7 @@ void dct8_c(const int16_t *src, int16_t
}
partialButterfly8(block, coef, shift_1st, 8);
- partialButterfly8(coef, block, shift_2nd, 8);
-
-#define N (8)
- for (int i = 0; i < N; i++)
- {
- for (int j = 0; j < N; j++)
- {
- dst[i * N + j] = block[i * N + j];
- }
- }
-
-#undef N
+ partialButterfly8(coef, dst, shift_2nd, 8);
}
void dct16_c(const int16_t *src, int16_t *dst, intptr_t stride)
@@ -537,18 +505,7 @@ void dct16_c(const int16_t *src, int16_t
}
partialButterfly16(block, coef, shift_1st, 16);
- partialButterfly16(coef, block, shift_2nd, 16);
-
-#define N (16)
- for (int i = 0; i < N; i++)
- {
- for (int j = 0; j < N; j++)
- {
- dst[i * N + j] = block[i * N + j];
- }
- }
-
-#undef N
+ partialButterfly16(coef, dst, shift_2nd, 16);
}
void dct32_c(const int16_t *src, int16_t *dst, intptr_t stride)
@@ -565,18 +522,7 @@ void dct32_c(const int16_t *src, int16_t
}
partialButterfly32(block, coef, shift_1st, 32);
- partialButterfly32(coef, block, shift_2nd, 32);
-
-#define N (32)
- for (int i = 0; i < N; i++)
- {
- for (int j = 0; j < N; j++)
- {
- dst[i * N + j] = block[i * N + j];
- }
- }
-
-#undef N
+ partialButterfly32(coef, dst, shift_2nd, 32);
}
void idst4_c(const int16_t *src, int16_t *dst, intptr_t stride)
@@ -587,18 +533,7 @@ void idst4_c(const int16_t *src, int16_t
ALIGN_VAR_32(int16_t, coef[4 * 4]);
ALIGN_VAR_32(int16_t, block[4 * 4]);
-#define N (4)
- for (int i = 0; i < N; i++)
- {
- for (int j = 0; j < N; j++)
- {
- block[i * N + j] = (int16_t)src[i * N + j];
- }
- }
-
-#undef N
-
- inversedst(block, coef, shift_1st); // Forward DST BY FAST ALGORITHM, block input, coef output
+ inversedst(src, coef, shift_1st); // Forward DST BY FAST ALGORITHM, block input, coef output
inversedst(coef, block, shift_2nd); // Forward DST BY FAST ALGORITHM, coef input, coeff output
for (int i = 0; i < 4; i++)
@@ -615,18 +550,7 @@ void idct4_c(const int16_t *src, int16_t
ALIGN_VAR_32(int16_t, coef[4 * 4]);
ALIGN_VAR_32(int16_t, block[4 * 4]);
-#define N (4)
- for (int i = 0; i < N; i++)
- {
- for (int j = 0; j < N; j++)
- {
- block[i * N + j] = (int16_t)src[i * N + j];
- }
- }
-
-#undef N
-
- partialButterflyInverse4(block, coef, shift_1st, 4); // Forward DST BY FAST ALGORITHM, block input, coef output
+ partialButterflyInverse4(src, coef, shift_1st, 4); // Forward DST BY FAST ALGORITHM, block input, coef output
partialButterflyInverse4(coef, block, shift_2nd, 4); // Forward DST BY FAST ALGORITHM, coef input, coeff output
for (int i = 0; i < 4; i++)
@@ -643,18 +567,7 @@ void idct8_c(const int16_t *src, int16_t
ALIGN_VAR_32(int16_t, coef[8 * 8]);
ALIGN_VAR_32(int16_t, block[8 * 8]);
-#define N (8)
- for (int i = 0; i < N; i++)
- {
- for (int j = 0; j < N; j++)
- {
- block[i * N + j] = (int16_t)src[i * N + j];
- }
- }
-
-#undef N
-
- partialButterflyInverse8(block, coef, shift_1st, 8);
+ partialButterflyInverse8(src, coef, shift_1st, 8);
partialButterflyInverse8(coef, block, shift_2nd, 8);
for (int i = 0; i < 8; i++)
{
@@ -670,18 +583,7 @@ void idct16_c(const int16_t *src, int16_
ALIGN_VAR_32(int16_t, coef[16 * 16]);
ALIGN_VAR_32(int16_t, block[16 * 16]);
-#define N (16)
- for (int i = 0; i < N; i++)
- {
- for (int j = 0; j < N; j++)
- {
- block[i * N + j] = (int16_t)src[i * N + j];
- }
- }
-
-#undef N
-
- partialButterflyInverse16(block, coef, shift_1st, 16);
+ partialButterflyInverse16(src, coef, shift_1st, 16);
partialButterflyInverse16(coef, block, shift_2nd, 16);
for (int i = 0; i < 16; i++)
{
@@ -697,18 +599,7 @@ void idct32_c(const int16_t *src, int16_
ALIGN_VAR_32(int16_t, coef[32 * 32]);
ALIGN_VAR_32(int16_t, block[32 * 32]);
-#define N (32)
- for (int i = 0; i < N; i++)
- {
- for (int j = 0; j < N; j++)
- {
- block[i * N + j] = (int16_t)src[i * N + j];
- }
- }
-
-#undef N
-
- partialButterflyInverse32(block, coef, shift_1st, 32);
+ partialButterflyInverse32(src, coef, shift_1st, 32);
partialButterflyInverse32(coef, block, shift_2nd, 32);
for (int i = 0; i < 32; i++)
diff -r 2abf89f5c4f2 -r 346fccbba4de source/common/pixel.cpp
--- a/source/common/pixel.cpp Thu Nov 20 14:31:04 2014 -0800
+++ b/source/common/pixel.cpp Fri Nov 21 12:03:54 2014 +0530
@@ -493,11 +493,12 @@ void blockfil_s_c(int16_t* dst, intptr_t
void copy16to16_shl(int16_t *dst, const int16_t *src, intptr_t stride, int shift, int size)
{
+ X265_CHECK(!(size & 3), "invalid size\n");
for (int i = 0; i < size; i++)
{
for (int j = 0; j < size; j++)
{
- dst[i * size + j] = (src[i * stride + j]) << shift;
+ dst[i * size + j] = src[i * stride + j] << shift;
}
}
}
diff -r 2abf89f5c4f2 -r 346fccbba4de source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Nov 20 14:31:04 2014 -0800
+++ b/source/common/x86/asm-primitives.cpp Fri Nov 21 12:03:54 2014 +0530
@@ -1548,6 +1548,7 @@ void Setup_Assembly_Primitives(EncoderPr
p.frame_init_lowres_core = x265_frame_init_lowres_core_sse2;
SA8D_INTER_FROM_BLOCK(sse2);
+ p.cpy16to16_shl = x265_copy16to16_shl_sse2;
p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_sse2;
p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_sse2;
p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_sse2;
@@ -1614,7 +1615,6 @@ void Setup_Assembly_Primitives(EncoderPr
LUMA_ADDAVG(_sse4);
CHROMA_ADDAVG(_sse4);
CHROMA_ADDAVG_422(_sse4);
- p.cpy16to16_shl = x265_copy16to16_shl_sse4;
p.cvt16to32_shr[BLOCK_4x4] = x265_cvt16to32_shr_4_sse4;
p.cvt16to32_shr[BLOCK_8x8] = x265_cvt16to32_shr_8_sse4;
p.cvt16to32_shr[BLOCK_16x16] = x265_cvt16to32_shr_16_sse4;
diff -r 2abf89f5c4f2 -r 346fccbba4de source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Thu Nov 20 14:31:04 2014 -0800
+++ b/source/common/x86/blockcopy8.asm Fri Nov 21 12:03:54 2014 +0530
@@ -3672,37 +3672,35 @@ BLOCKCOPY_SS_W64_H4_avx 64, 64
;--------------------------------------------------------------------------------------
; void copy16to16_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size);
;--------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal copy16to16_shl, 5, 7, 2, dst, src, stride, shift, size
+INIT_XMM sse2
+cglobal copy16to16_shl, 5, 6, 2, dst, src, stride, shift, size
%define shift m1
; make shift
- mov r5d, r3m
- movd shift, r5d
+ movd shift, r3d
; register alloc
; r0 - dst
; r1 - src
; r2 - stride
- ; r3 - shift
; r4 - size
sub r2d, r4d
add r2d, r2d
mov r5d, r4d
- shr r4d, 3
+ shr r4d, 2
.loop_row:
- mov r6d, r4d
+ mov r3d, r4d
.loop_col:
- movu m0, [r1]
+ movh m0, [r1]
psllw m0, shift
- movu [r0], m0
-
- add r1, 16
- add r0, 16
-
- dec r6d
+ movh [r0], m0
+
+ add r1, 8
+ add r0, 8
+
+ dec r3d
jnz .loop_col
add r1, r2
diff -r 2abf89f5c4f2 -r 346fccbba4de source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h Thu Nov 20 14:31:04 2014 -0800
+++ b/source/common/x86/blockcopy8.h Fri Nov 21 12:03:54 2014 +0530
@@ -32,7 +32,7 @@ void x265_cvt32to16_shl_4_avx2(int16_t*
void x265_cvt32to16_shl_8_avx2(int16_t* dst, const int* src, intptr_t, int);
void x265_cvt32to16_shl_16_avx2(int16_t* dst, const int* src, intptr_t, int);
void x265_cvt32to16_shl_32_avx2(int16_t* dst, const int* src, intptr_t, int);
-void x265_copy16to16_shl_sse4(int16_t* dst, const int16_t* src, intptr_t, int32_t, int32_t);
+void x265_copy16to16_shl_sse2(int16_t* dst, const int16_t* src, intptr_t, int32_t, int32_t);
More information about the x265-commits
mailing list