[x265-commits] [x265] testbench support for addAvg primitive
Dnyaneshwar G
dnyaneshwar at multicorewareinc.com
Tue Jan 21 04:26:20 CET 2014
details: http://hg.videolan.org/x265/rev/56ce4f7669c6
branches:
changeset: 5844:56ce4f7669c6
user: Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
date: Sat Jan 18 12:01:44 2014 +0530
description:
testbench support for addAvg primitive
Subject: [x265] asm: IntraAng32x32 Mode[17]
details: http://hg.videolan.org/x265/rev/9f7fca027b41
branches:
changeset: 5845:9f7fca027b41
user: Min Chen <chenm003 at 163.com>
date: Fri Jan 17 14:36:07 2014 +0800
description:
asm: IntraAng32x32 Mode[17]
Subject: [x265] Nalwrite: removed EMULATION_SIZE macro and calculate the emulationSize from Encoded bitstream size
details: http://hg.videolan.org/x265/rev/356d91e22b25
branches:
changeset: 5846:356d91e22b25
user: Gopu Govindaswamy
date: Mon Jan 20 15:36:02 2014 -0800
description:
Nalwrite: removed EMULATION_SIZE macro and calculate the emulationSize from Encoded bitstream size
Subject: [x265] NALwrite: remove unused macro
details: http://hg.videolan.org/x265/rev/cf79f89c783c
branches:
changeset: 5847:cf79f89c783c
user: Deepthi Nandakumar <deepthi at multicorewareinc.com>
date: Mon Jan 20 18:33:40 2014 +0530
description:
NALwrite: remove unused macro
Subject: [x265] lookahead: call sliceTypeAnalyse when necessary
details: http://hg.videolan.org/x265/rev/bca352c8689e
branches:
changeset: 5848:bca352c8689e
user: Aarthi Thirumalai
date: Fri Jan 17 18:45:35 2014 +0530
description:
lookahead: call sliceTypeAnalyse when necessary
call sliceTypeAnalyse even when cutree is on or lookaheadDepth or
scenecutThreshold > 0
performs lookahead when lookaheadDepth > 0, activates lookahead for cutree when
b-adapt/bframes = 0 and cutree is set, also enables scenecut for bframes = 0
cases. improves psnr/ssim by .5 dB.
Subject: [x265] rc: avoid issues from zero-residual lookahead blocks, introduce a small bias
details: http://hg.videolan.org/x265/rev/ffb53cd1f953
branches:
changeset: 5849:ffb53cd1f953
user: Aarthi Thirumalai
date: Fri Jan 17 18:47:42 2014 +0530
description:
rc: avoid issues from zero-residual lookahead blocks, introduce a small bias
Subject: [x265] Nalwrite: removed EMULATION_SIZE macro and calculate the emulationSize from Encoded bitstream size
details: http://hg.videolan.org/x265/rev/21a5fb7ab965
branches: stable
changeset: 5850:21a5fb7ab965
user: Gopu Govindaswamy
date: Mon Jan 20 15:36:02 2014 -0800
description:
Nalwrite: removed EMULATION_SIZE macro and calculate the emulationSize from Encoded bitstream size
Subject: [x265] Merge with stable
details: http://hg.videolan.org/x265/rev/b5b7d8e64024
branches:
changeset: 5851:b5b7d8e64024
user: Steve Borho <steve at borho.org>
date: Mon Jan 20 14:17:39 2014 -0600
description:
Merge with stable
Subject: [x265] encoder: fix the slicetype char table
details: http://hg.videolan.org/x265/rev/7bfd1b01953c
branches:
changeset: 5852:7bfd1b01953c
user: Steve Borho <steve at borho.org>
date: Mon Jan 20 14:27:15 2014 -0600
description:
encoder: fix the slicetype char table
Subject: [x265] slicetype: white-space fixes
details: http://hg.videolan.org/x265/rev/925e612b0591
branches:
changeset: 5853:925e612b0591
user: Steve Borho <steve at borho.org>
date: Mon Jan 20 14:50:05 2014 -0600
description:
slicetype: white-space fixes
Subject: [x265] cli: tweak aq-strength CLI help
details: http://hg.videolan.org/x265/rev/950c9a864cb6
branches:
changeset: 5854:950c9a864cb6
user: Steve Borho <steve at borho.org>
date: Mon Jan 20 21:22:10 2014 -0600
description:
cli: tweak aq-strength CLI help
diffstat:
source/Lib/TLibEncoder/NALwrite.cpp | 3 +-
source/common/common.h | 1 -
source/common/x86/asm-primitives.cpp | 4 +
source/common/x86/intrapred8.asm | 155 +++++++++++++
source/encoder/encoder.cpp | 6 +-
source/encoder/slicetype.cpp | 404 +++++++++++++---------------------
source/encoder/slicetype.h | 3 +-
source/test/pixelharness.cpp | 54 ++++
source/test/pixelharness.h | 1 +
source/x265.cpp | 2 +-
10 files changed, 374 insertions(+), 259 deletions(-)
diffs (truncated from 850 to 300 lines):
diff -r c88314c4a1a1 -r 950c9a864cb6 source/Lib/TLibEncoder/NALwrite.cpp
--- a/source/Lib/TLibEncoder/NALwrite.cpp Fri Jan 17 12:18:25 2014 +0530
+++ b/source/Lib/TLibEncoder/NALwrite.cpp Mon Jan 20 21:22:10 2014 -0600
@@ -84,7 +84,8 @@ void write(uint8_t*& out, OutputNALUnit&
*/
uint32_t fsize = nalu.m_bitstream.getByteStreamLength();
uint8_t* fifo = nalu.m_bitstream.getFIFO();
- uint8_t* emulation = (uint8_t*)X265_MALLOC(uint8_t, fsize + EMULATION_SIZE);
+ uint32_t emulationSize = fsize / 2;
+ uint8_t* emulation = (uint8_t*)X265_MALLOC(uint8_t, fsize + emulationSize);
uint32_t nalsize = 0;
if (emulation)
diff -r c88314c4a1a1 -r 950c9a864cb6 source/common/common.h
--- a/source/common/common.h Fri Jan 17 12:18:25 2014 +0530
+++ b/source/common/common.h Mon Jan 20 21:22:10 2014 -0600
@@ -70,7 +70,6 @@
#define MAX_NAL_UNITS 5
#define MIN_FIFO_SIZE 1000
-#define EMULATION_SIZE 1000
#define CHECKED_MALLOC(var, type, count) \
{ \
diff -r c88314c4a1a1 -r 950c9a864cb6 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Jan 17 12:18:25 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Jan 20 21:22:10 2014 -0600
@@ -553,6 +553,8 @@ extern "C" {
p.intra_pred[BLOCK_8x8][mode] = x265_intra_pred_ang8_ ## fno ## _ ## cpu;
#define SETUP_INTRA_ANG16(mode, fno, cpu) \
p.intra_pred[BLOCK_16x16][mode] = x265_intra_pred_ang16_ ## fno ## _ ## cpu;
+#define SETUP_INTRA_ANG32(mode, fno, cpu) \
+ p.intra_pred[BLOCK_32x32][mode] = x265_intra_pred_ang32_ ## fno ## _ ## cpu;
#define SETUP_INTRA_ANG32(mode, fno, cpu) \
p.intra_pred[BLOCK_32x32][mode] = x265_intra_pred_ang32_ ## fno ## _ ## cpu;
@@ -1011,6 +1013,8 @@ void Setup_Assembly_Primitives(EncoderPr
SETUP_INTRA_ANG4(32, 4, sse4);
SETUP_INTRA_ANG4(33, 3, sse4);
+ SETUP_INTRA_ANG32(17, 17, sse4);
+
p.dct[DCT_8x8] = x265_dct8_sse4;
}
if (cpuMask & X265_CPU_AVX)
diff -r c88314c4a1a1 -r 950c9a864cb6 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm Fri Jan 17 12:18:25 2014 +0530
+++ b/source/common/x86/intrapred8.asm Mon Jan 20 21:22:10 2014 -0600
@@ -31,6 +31,9 @@ pb_0_8 times 8 db 0, 8
pb_unpackbw1 times 2 db 1, 8, 2, 8, 3, 8, 4, 8
tab_Si: db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7
+c_mode32_17_0: db 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0
+c_shuf8_0: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+c_deinterval8: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
const ang_table
%assign x 0
@@ -1344,6 +1347,158 @@ cglobal intra_pred_ang32_2, 3,4,4
movu [r0 + r3 + 16], m2
RET
+
+; Process Intra32x32, input 9x4 in [m0, m1, m2, m3], output 4x8
+%macro PROC32_8x4 5 ; col4, c0, c1, c2, c3
+ %if %2 == 0
+ pmovzxbw m0, m0
+ %else
+ pshufb m0, [r3]
+ pmaddubsw m0, [r4 + %2 * 16]
+ pmulhrsw m0, m5 ; [07 06 05 04 03 02 01 00]
+ %endif
+ %if %3 == 0
+ pmovzxbw m1, m1
+ %else
+ pshufb m1, [r3]
+ pmaddubsw m1, [r4 + %3 * 16]
+ pmulhrsw m1, m5 ; [17 16 15 14 13 12 11 10]
+ %endif
+ %if %4 == 0
+ pmovzxbw m2, m2
+ %else
+ pshufb m2, [r3]
+ pmaddubsw m2, [r4 + %4 * 16]
+ pmulhrsw m2, m5 ; [27 26 25 24 23 22 21 20]
+ %endif
+ %if %5 == 0
+ pmovzxbw m3, m3
+ %else
+ pshufb m3, [r3]
+ pmaddubsw m3, [r4 + %5 * 16]
+ pmulhrsw m3, m5 ; [37 36 35 34 33 32 31 30]
+ %endif
+
+ ; transpose
+ packuswb m0, m2 ; [27 26 25 24 23 22 21 20 07 06 05 04 03 02 01 00]
+ packuswb m1, m3 ; [37 36 35 34 33 32 31 30 17 16 15 14 13 12 11 10]
+ pshufb m0, m6 ; [27 07 26 06 25 05 24 04 23 03 22 02 21 01 20 00]
+ pshufb m1, m6 ; [37 17 36 16 35 15 34 14 33 13 32 12 31 11 30 10]
+ punpcklbw m2, m0, m1 ; [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00]
+ punpckhbw m0, m1 ; [37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04]
+
+ ; store
+ movd [r0 + + %1 * 4], m2
+ pextrd [r0 + r1 + %1 * 4], m2, 1
+ pextrd [r0 + r1*2 + %1 * 4], m2, 2
+ pextrd [r0 + r5 + %1 * 4], m2, 3
+ movd [r6 + %1 * 4], m0
+ pextrd [r6 + r1 + %1 * 4], m0, 1
+ pextrd [r6 + r1*2 + %1 * 4], m0, 2
+ pextrd [r6 + r5 + %1 * 4], m0, 3
+%endmacro
+
+
+;-----------------------------------------------------------------------------
+; void intraPredAng32(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal intra_pred_ang32_17, 4,7,8
+ ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
+ mov r6, rsp
+ sub rsp, 64+gprsize
+ and rsp, ~63
+ mov [rsp+64], r6
+
+ ; collect reference pixel
+ movu m0, [r3]
+ movu m1, [r3 + 16]
+ pshufb m0, [c_mode32_17_0]
+ pshufb m1, [c_mode32_17_0]
+ mova [rsp ], m1
+ movu [rsp + 13], m0
+ movu m0, [r2 + 1]
+ movu m1, [r2 + 1 + 16]
+ movu [rsp + 26], m0
+ movu [rsp + 26 + 16], m1
+ mov [rsp + 63], byte 4
+
+ ; filter
+ lea r2, [rsp + 25] ; r2 -> [0]
+ lea r3, [c_shuf8_0] ; r3 -> shuffle8
+ lea r4, [ang_table] ; r4 -> ang_table
+ lea r5, [r1 * 3] ; r5 -> 3 * stride
+ lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
+ mova m5, [pw_1024] ; m5 -> 1024
+ mova m6, [c_deinterval8] ; m6 -> c_deinterval8
+
+.loop:
+ ; Row[0 - 3]
+ movu m3, [r2 - 3]
+ palignr m0, m3, 3
+ palignr m1, m3, 2
+ palignr m2, m3, 1
+ PROC32_8x4 0, 6,12,18,24
+
+ ; Row[4 - 7]
+ movu m3, [r2 - 6]
+ palignr m0, m3, 2
+ mova m1, m0
+ palignr m2, m3, 1
+ PROC32_8x4 1, 30,4,10,16
+
+ ; Row[8 - 11]
+ movu m3, [r2 - 9]
+ palignr m0, m3, 2
+ palignr m1, m3, 1
+ mova m2, m1
+ PROC32_8x4 2, 22,28,2,8
+
+ ; Row[12 - 15]
+ movu m3, [r2 - 12]
+ palignr m0, m3, 2
+ palignr m1, m3, 1
+ mova m2, m3
+ PROC32_8x4 3, 14,20,26,0
+
+ ; Row[16 - 19]
+ movu m3, [r2 - 16]
+ palignr m0, m3, 3
+ palignr m1, m3, 2
+ palignr m2, m3, 1
+ PROC32_8x4 4, 6,12,18,24
+
+ ; Row[20 - 23]
+ movu m3, [r2 - 19]
+ palignr m0, m3, 2
+ mova m1, m0
+ palignr m2, m3, 1
+ PROC32_8x4 5, 30,4,10,16
+
+ ; Row[24 - 27]
+ movu m3, [r2 - 22]
+ palignr m0, m3, 2
+ palignr m1, m3, 1
+ mova m2, m1
+ PROC32_8x4 6, 22,28,2,8
+
+ ; Row[28 - 31]
+ movu m3, [r2 - 25]
+ palignr m0, m3, 2
+ palignr m1, m3, 1
+ mova m2, m3
+ PROC32_8x4 7, 14,20,26,0
+
+ lea r0, [r6 + r1 * 4]
+ lea r6, [r6 + r1 * 8]
+ add r2, 8
+ dec byte [rsp + 63]
+ jnz .loop
+
+ mov rsp, [rsp+64]
+ RET
+
+
;-----------------------------------------------------------------------------
; void all_angs_pred_4x4(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma)
;-----------------------------------------------------------------------------
diff -r c88314c4a1a1 -r 950c9a864cb6 source/encoder/encoder.cpp
--- a/source/encoder/encoder.cpp Fri Jan 17 12:18:25 2014 +0530
+++ b/source/encoder/encoder.cpp Mon Jan 20 21:22:10 2014 -0600
@@ -543,10 +543,10 @@ void Encoder::printSummary()
if (sliceType == I_SLICE)
{
if (depth == (int)g_maxCUDepth - 1)
- len += sprintf(stats + len, " %dx%d: "LL"%%", cuSize/2, cuSize/2, cntIntraNxN);
+ len += sprintf(stats + len, " %dx%d: "LL"%%", cuSize / 2, cuSize / 2, cntIntraNxN);
}
}
- const char slicechars = "BPI";
+ const char slicechars[] = "BPI";
if (stats[0])
x265_log(&param, X265_LOG_INFO, "%c%-2d: %s\n", slicechars[sliceType], cuSize, stats);
}
@@ -1324,7 +1324,7 @@ void Encoder::configure(x265_param *_par
_param->rc.aqStrength = 0.0;
}
- if (_param->bFrameAdaptive == 0 && _param->rc.cuTree)
+ if (_param->lookaheadDepth == 0 && _param->rc.cuTree)
{
x265_log(_param, X265_LOG_WARNING, "cuTree disabled, requires lookahead to be enabled\n");
_param->rc.cuTree = 0;
diff -r c88314c4a1a1 -r 950c9a864cb6 source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp Fri Jan 17 12:18:25 2014 +0530
+++ b/source/encoder/slicetype.cpp Mon Jan 20 21:22:10 2014 -0600
@@ -461,6 +461,8 @@ void LookaheadRow::estimateCUCost(Lowres
cuy > 0 && cuy < heightInCU - 1) || widthInCU <= 2 || heightInCU <= 2;
me.setSourcePU(pelOffset, cuSize, cuSize);
+ /* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. */
+ int lowresPenalty = 4;
MV(*fenc_mvs[2]) = { &fenc->lowresMvs[0][b - p0 - 1][cuXY],
&fenc->lowresMvs[1][p1 - b - 1][cuXY] };
@@ -585,6 +587,7 @@ void LookaheadRow::estimateCUCost(Lowres
primitives.transpose[nLog2SizeMinus2](buf_trans, me.fenc, FENC_STRIDE);
pixelcmp_t satd = primitives.satd[partitionFromSizes(cuSize, cuSize)];
int icost = me.COST_MAX, cost;
+ const int intraPenalty = 5 * lookAheadLambda;
for (uint32_t mode = 0; mode < 35; mode++)
{
if ((mode >= 2) && (mode < 18))
@@ -595,7 +598,7 @@ void LookaheadRow::estimateCUCost(Lowres
icost = cost;
}
- // TOOD: i_icost += intra_penalty + lowres_penalty;
+ icost += intraPenalty + lowresPenalty;
fenc->intraCost[cuXY] = icost;
fenc->rowSatds[0][0][cuy] += icost;
if (bFrameScoreCU)
@@ -605,6 +608,7 @@ void LookaheadRow::estimateCUCost(Lowres
costIntraAq += (icost * fenc->invQscaleFactor[cuXY] + 128) >> 8;
}
}
+ bcost += lowresPenalty;
if (!bBidir)
{
if (fenc->intraCost[cuXY] < bcost)
@@ -631,292 +635,189 @@ void LookaheadRow::estimateCUCost(Lowres
void Lookahead::slicetypeDecide()
{
- if (cfg->param.bFrameAdaptive && cfg->param.lookaheadDepth && cfg->param.bframes)
+ if ((cfg->param.bFrameAdaptive && cfg->param.bframes) ||
+ cfg->param.rc.cuTree || cfg->param.scenecutThreshold ||
+ (cfg->param.lookaheadDepth && cfg->param.rc.vbvBufferSize))
{
slicetypeAnalyse(false);
+ }
- TComPic *list[X265_LOOKAHEAD_MAX];
- TComPic *ipic = inputQueue.first();
- int j;
- for (j = 0; ipic && j < cfg->param.bframes + 2; ipic = ipic->m_next)
+
+ TComPic *list[X265_LOOKAHEAD_MAX];
+ TComPic *ipic = inputQueue.first();
+ int j;
+ for (j = 0; ipic && j < cfg->param.bframes + 2; ipic = ipic->m_next)
More information about the x265-commits
mailing list