[x265-commits] [x265] testbench support for addAvg primitive

Dnyaneshwar G dnyaneshwar at multicorewareinc.com
Tue Jan 21 04:26:20 CET 2014


details:   http://hg.videolan.org/x265/rev/56ce4f7669c6
branches:  
changeset: 5844:56ce4f7669c6
user:      Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
date:      Sat Jan 18 12:01:44 2014 +0530
description:
testbench support for addAvg primitive
Subject: [x265] asm: IntraAng32x32 Mode[17]

details:   http://hg.videolan.org/x265/rev/9f7fca027b41
branches:  
changeset: 5845:9f7fca027b41
user:      Min Chen <chenm003 at 163.com>
date:      Fri Jan 17 14:36:07 2014 +0800
description:
asm: IntraAng32x32 Mode[17]
Subject: [x265] Nalwrite: removed EMULATION_SIZE macro and calculate the emulationSize from Encoded bitstream size

details:   http://hg.videolan.org/x265/rev/356d91e22b25
branches:  
changeset: 5846:356d91e22b25
user:      Gopu Govindaswamy
date:      Mon Jan 20 15:36:02 2014 -0800
description:
Nalwrite: removed EMULATION_SIZE macro and calculate the emulationSize from Encoded bitstream size
Subject: [x265] NALwrite: remove unused macro

details:   http://hg.videolan.org/x265/rev/cf79f89c783c
branches:  
changeset: 5847:cf79f89c783c
user:      Deepthi Nandakumar <deepthi at multicorewareinc.com>
date:      Mon Jan 20 18:33:40 2014 +0530
description:
NALwrite: remove unused macro
Subject: [x265] lookahead: call sliceTypeAnalyse when necessary

details:   http://hg.videolan.org/x265/rev/bca352c8689e
branches:  
changeset: 5848:bca352c8689e
user:      Aarthi Thirumalai
date:      Fri Jan 17 18:45:35 2014 +0530
description:
lookahead: call sliceTypeAnalyse when necessary

call sliceTypeAnalyse even when cutree is on, or when lookaheadDepth or
scenecutThreshold > 0

Performs lookahead when lookaheadDepth > 0, activates lookahead for cutree when
b-adapt/bframes = 0 and cutree is set, and also enables scenecut for bframes = 0
cases. Improves PSNR/SSIM by 0.5 dB.
Subject: [x265] rc: avoid issues from zero-residual lookahead blocks, introduce a small bias

details:   http://hg.videolan.org/x265/rev/ffb53cd1f953
branches:  
changeset: 5849:ffb53cd1f953
user:      Aarthi Thirumalai
date:      Fri Jan 17 18:47:42 2014 +0530
description:
rc: avoid issues from zero-residual lookahead blocks, introduce a small bias
Subject: [x265] Nalwrite: removed EMULATION_SIZE macro and calculate the emulationSize from Encoded bitstream size

details:   http://hg.videolan.org/x265/rev/21a5fb7ab965
branches:  stable
changeset: 5850:21a5fb7ab965
user:      Gopu Govindaswamy
date:      Mon Jan 20 15:36:02 2014 -0800
description:
Nalwrite: removed EMULATION_SIZE macro and calculate the emulationSize from Encoded bitstream size
Subject: [x265] Merge with stable

details:   http://hg.videolan.org/x265/rev/b5b7d8e64024
branches:  
changeset: 5851:b5b7d8e64024
user:      Steve Borho <steve at borho.org>
date:      Mon Jan 20 14:17:39 2014 -0600
description:
Merge with stable
Subject: [x265] encoder: fix the slicetype char table

details:   http://hg.videolan.org/x265/rev/7bfd1b01953c
branches:  
changeset: 5852:7bfd1b01953c
user:      Steve Borho <steve at borho.org>
date:      Mon Jan 20 14:27:15 2014 -0600
description:
encoder: fix the slicetype char table
Subject: [x265] slicetype: white-space fixes

details:   http://hg.videolan.org/x265/rev/925e612b0591
branches:  
changeset: 5853:925e612b0591
user:      Steve Borho <steve at borho.org>
date:      Mon Jan 20 14:50:05 2014 -0600
description:
slicetype: white-space fixes
Subject: [x265] cli: tweak aq-strength CLI help

details:   http://hg.videolan.org/x265/rev/950c9a864cb6
branches:  
changeset: 5854:950c9a864cb6
user:      Steve Borho <steve at borho.org>
date:      Mon Jan 20 21:22:10 2014 -0600
description:
cli: tweak aq-strength CLI help

diffstat:

 source/Lib/TLibEncoder/NALwrite.cpp  |    3 +-
 source/common/common.h               |    1 -
 source/common/x86/asm-primitives.cpp |    4 +
 source/common/x86/intrapred8.asm     |  155 +++++++++++++
 source/encoder/encoder.cpp           |    6 +-
 source/encoder/slicetype.cpp         |  404 +++++++++++++---------------------
 source/encoder/slicetype.h           |    3 +-
 source/test/pixelharness.cpp         |   54 ++++
 source/test/pixelharness.h           |    1 +
 source/x265.cpp                      |    2 +-
 10 files changed, 374 insertions(+), 259 deletions(-)

diffs (truncated from 850 to 300 lines):

diff -r c88314c4a1a1 -r 950c9a864cb6 source/Lib/TLibEncoder/NALwrite.cpp
--- a/source/Lib/TLibEncoder/NALwrite.cpp	Fri Jan 17 12:18:25 2014 +0530
+++ b/source/Lib/TLibEncoder/NALwrite.cpp	Mon Jan 20 21:22:10 2014 -0600
@@ -84,7 +84,8 @@ void write(uint8_t*& out, OutputNALUnit&
      */
     uint32_t fsize = nalu.m_bitstream.getByteStreamLength();
     uint8_t* fifo = nalu.m_bitstream.getFIFO();
-    uint8_t* emulation = (uint8_t*)X265_MALLOC(uint8_t, fsize + EMULATION_SIZE);
+    uint32_t  emulationSize = fsize / 2;
+    uint8_t* emulation = (uint8_t*)X265_MALLOC(uint8_t, fsize + emulationSize);
     uint32_t nalsize = 0;
 
     if (emulation)
diff -r c88314c4a1a1 -r 950c9a864cb6 source/common/common.h
--- a/source/common/common.h	Fri Jan 17 12:18:25 2014 +0530
+++ b/source/common/common.h	Mon Jan 20 21:22:10 2014 -0600
@@ -70,7 +70,6 @@
 
 #define MAX_NAL_UNITS 5
 #define MIN_FIFO_SIZE 1000
-#define EMULATION_SIZE 1000
 
 #define CHECKED_MALLOC(var, type, count) \
     { \
diff -r c88314c4a1a1 -r 950c9a864cb6 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Jan 17 12:18:25 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Jan 20 21:22:10 2014 -0600
@@ -553,6 +553,8 @@ extern "C" {
     p.intra_pred[BLOCK_8x8][mode] = x265_intra_pred_ang8_ ## fno ## _ ## cpu;
 #define SETUP_INTRA_ANG16(mode, fno, cpu) \
     p.intra_pred[BLOCK_16x16][mode] = x265_intra_pred_ang16_ ## fno ## _ ## cpu;
+#define SETUP_INTRA_ANG32(mode, fno, cpu) \
+    p.intra_pred[BLOCK_32x32][mode] = x265_intra_pred_ang32_ ## fno ## _ ## cpu;
 
 #define SETUP_INTRA_ANG32(mode, fno, cpu) \
     p.intra_pred[BLOCK_32x32][mode] = x265_intra_pred_ang32_ ## fno ## _ ## cpu;
@@ -1011,6 +1013,8 @@ void Setup_Assembly_Primitives(EncoderPr
         SETUP_INTRA_ANG4(32, 4, sse4);
         SETUP_INTRA_ANG4(33, 3, sse4);
 
+        SETUP_INTRA_ANG32(17, 17, sse4);
+
         p.dct[DCT_8x8] = x265_dct8_sse4;
     }
     if (cpuMask & X265_CPU_AVX)
diff -r c88314c4a1a1 -r 950c9a864cb6 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm	Fri Jan 17 12:18:25 2014 +0530
+++ b/source/common/x86/intrapred8.asm	Mon Jan 20 21:22:10 2014 -0600
@@ -31,6 +31,9 @@ pb_0_8          times 8 db 0, 8
 pb_unpackbw1    times 2 db 1, 8, 2, 8, 3, 8, 4, 8
 
 tab_Si:  db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7
+c_mode32_17_0:  db 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0
+c_shuf8_0:      db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+c_deinterval8:  db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
 
 const ang_table
 %assign x 0
@@ -1344,6 +1347,158 @@ cglobal intra_pred_ang32_2, 3,4,4
     movu            [r0 + r3 + 16], m2
     RET
 
+
+; Process Intra32x32, input 9x4 in [m0, m1, m2, m3], output 4x8
+%macro PROC32_8x4 5  ; col4, c0, c1, c2, c3
+  %if %2 == 0
+    pmovzxbw    m0, m0
+  %else
+    pshufb      m0, [r3]
+    pmaddubsw   m0, [r4 + %2 * 16]
+    pmulhrsw    m0, m5                  ; [07 06 05 04 03 02 01 00]
+  %endif
+  %if %3 == 0
+    pmovzxbw    m1, m1
+  %else
+    pshufb      m1, [r3]
+    pmaddubsw   m1, [r4 + %3 * 16]
+    pmulhrsw    m1, m5                  ; [17 16 15 14 13 12 11 10]
+  %endif
+  %if %4 == 0
+    pmovzxbw    m2, m2
+  %else
+    pshufb      m2, [r3]
+    pmaddubsw   m2, [r4 + %4 * 16]
+    pmulhrsw    m2, m5                  ; [27 26 25 24 23 22 21 20]
+  %endif
+  %if %5 == 0
+    pmovzxbw    m3, m3
+  %else
+    pshufb      m3, [r3]
+    pmaddubsw   m3, [r4 + %5 * 16]
+    pmulhrsw    m3, m5                  ; [37 36 35 34 33 32 31 30]
+  %endif
+
+    ; transpose
+    packuswb    m0, m2                  ; [27 26 25 24 23 22 21 20 07 06 05 04 03 02 01 00]
+    packuswb    m1, m3                  ; [37 36 35 34 33 32 31 30 17 16 15 14 13 12 11 10]
+    pshufb      m0, m6                  ; [27 07 26 06 25 05 24 04 23 03 22 02 21 01 20 00]
+    pshufb      m1, m6                  ; [37 17 36 16 35 15 34 14 33 13 32 12 31 11 30 10]
+    punpcklbw   m2, m0, m1              ; [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00]
+    punpckhbw   m0, m1                  ; [37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04]
+
+    ; store
+    movd        [r0 +       + %1 * 4], m2
+    pextrd      [r0 +  r1   + %1 * 4], m2, 1
+    pextrd      [r0 +  r1*2 + %1 * 4], m2, 2
+    pextrd      [r0 +  r5   + %1 * 4], m2, 3
+    movd        [r6         + %1 * 4], m0
+    pextrd      [r6 +  r1   + %1 * 4], m0, 1
+    pextrd      [r6 +  r1*2 + %1 * 4], m0, 2
+    pextrd      [r6 +  r5   + %1 * 4], m0, 3
+%endmacro
+
+
+;-----------------------------------------------------------------------------
+; void intraPredAng32(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal intra_pred_ang32_17, 4,7,8
+    ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line
+    mov         r6, rsp
+    sub         rsp, 64+gprsize
+    and         rsp, ~63
+    mov         [rsp+64], r6
+
+    ; collect reference pixel
+    movu        m0, [r3]
+    movu        m1, [r3 + 16]
+    pshufb      m0, [c_mode32_17_0]
+    pshufb      m1, [c_mode32_17_0]
+    mova        [rsp     ], m1
+    movu        [rsp + 13], m0
+    movu        m0, [r2 + 1]
+    movu        m1, [r2 + 1 + 16]
+    movu        [rsp + 26], m0
+    movu        [rsp + 26 + 16], m1
+    mov         [rsp + 63], byte 4
+
+    ; filter
+    lea         r2, [rsp + 25]          ; r2 -> [0]
+    lea         r3, [c_shuf8_0]         ; r3 -> shuffle8
+    lea         r4, [ang_table]         ; r4 -> ang_table
+    lea         r5, [r1 * 3]            ; r5 -> 3 * stride
+    lea         r6, [r0 + r1 * 4]       ; r6 -> 4 * stride
+    mova        m5, [pw_1024]           ; m5 -> 1024
+    mova        m6, [c_deinterval8]     ; m6 -> c_deinterval8
+
+.loop:
+    ; Row[0 - 3]
+    movu        m3, [r2 - 3]
+    palignr     m0, m3, 3
+    palignr     m1, m3, 2
+    palignr     m2, m3, 1
+    PROC32_8x4  0, 6,12,18,24
+
+    ; Row[4 - 7]
+    movu        m3, [r2 - 6]
+    palignr     m0, m3, 2
+    mova        m1, m0
+    palignr     m2, m3, 1
+    PROC32_8x4  1, 30,4,10,16
+
+    ; Row[8 - 11]
+    movu        m3, [r2 - 9]
+    palignr     m0, m3, 2
+    palignr     m1, m3, 1
+    mova        m2, m1
+    PROC32_8x4  2, 22,28,2,8
+
+    ; Row[12 - 15]
+    movu        m3, [r2 - 12]
+    palignr     m0, m3, 2
+    palignr     m1, m3, 1
+    mova        m2, m3
+    PROC32_8x4  3, 14,20,26,0
+
+    ; Row[16 - 19]
+    movu        m3, [r2 - 16]
+    palignr     m0, m3, 3
+    palignr     m1, m3, 2
+    palignr     m2, m3, 1
+    PROC32_8x4  4, 6,12,18,24
+
+    ; Row[20 - 23]
+    movu        m3, [r2 - 19]
+    palignr     m0, m3, 2
+    mova        m1, m0
+    palignr     m2, m3, 1
+    PROC32_8x4  5, 30,4,10,16
+
+    ; Row[24 - 27]
+    movu        m3, [r2 - 22]
+    palignr     m0, m3, 2
+    palignr     m1, m3, 1
+    mova        m2, m1
+    PROC32_8x4  6, 22,28,2,8
+
+    ; Row[28 - 31]
+    movu        m3, [r2 - 25]
+    palignr     m0, m3, 2
+    palignr     m1, m3, 1
+    mova        m2, m3
+    PROC32_8x4  7, 14,20,26,0
+
+    lea         r0, [r6 + r1 * 4]
+    lea         r6, [r6 + r1 * 8]
+    add         r2, 8
+    dec         byte [rsp + 63]
+    jnz        .loop
+
+    mov         rsp, [rsp+64]
+    RET
+
+
 ;-----------------------------------------------------------------------------
 ; void all_angs_pred_4x4(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma)
 ;-----------------------------------------------------------------------------
diff -r c88314c4a1a1 -r 950c9a864cb6 source/encoder/encoder.cpp
--- a/source/encoder/encoder.cpp	Fri Jan 17 12:18:25 2014 +0530
+++ b/source/encoder/encoder.cpp	Mon Jan 20 21:22:10 2014 -0600
@@ -543,10 +543,10 @@ void Encoder::printSummary()
                 if (sliceType == I_SLICE)
                 {
                     if (depth == (int)g_maxCUDepth - 1)
-                        len += sprintf(stats + len, " %dx%d: "LL"%%", cuSize/2, cuSize/2, cntIntraNxN);
+                        len += sprintf(stats + len, " %dx%d: "LL"%%", cuSize / 2, cuSize / 2, cntIntraNxN);
                 }
             }
-            const char slicechars = "BPI";
+            const char slicechars[] = "BPI";
             if (stats[0])
                 x265_log(&param, X265_LOG_INFO, "%c%-2d: %s\n", slicechars[sliceType], cuSize, stats);
         }
@@ -1324,7 +1324,7 @@ void Encoder::configure(x265_param *_par
         _param->rc.aqStrength = 0.0;
     }
 
-    if (_param->bFrameAdaptive == 0 && _param->rc.cuTree)
+    if (_param->lookaheadDepth == 0 && _param->rc.cuTree)
     {
         x265_log(_param, X265_LOG_WARNING, "cuTree disabled, requires lookahead to be enabled\n");
         _param->rc.cuTree = 0;
diff -r c88314c4a1a1 -r 950c9a864cb6 source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp	Fri Jan 17 12:18:25 2014 +0530
+++ b/source/encoder/slicetype.cpp	Mon Jan 20 21:22:10 2014 -0600
@@ -461,6 +461,8 @@ void LookaheadRow::estimateCUCost(Lowres
                                 cuy > 0 && cuy < heightInCU - 1) || widthInCU <= 2 || heightInCU <= 2;
 
     me.setSourcePU(pelOffset, cuSize, cuSize);
+    /* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. */
+    int lowresPenalty = 4;
 
     MV(*fenc_mvs[2]) = { &fenc->lowresMvs[0][b - p0 - 1][cuXY],
                          &fenc->lowresMvs[1][p1 - b - 1][cuXY] };
@@ -585,6 +587,7 @@ void LookaheadRow::estimateCUCost(Lowres
         primitives.transpose[nLog2SizeMinus2](buf_trans, me.fenc, FENC_STRIDE);
         pixelcmp_t satd = primitives.satd[partitionFromSizes(cuSize, cuSize)];
         int icost = me.COST_MAX, cost;
+        const int intraPenalty = 5 * lookAheadLambda;
         for (uint32_t mode = 0; mode < 35; mode++)
         {
             if ((mode >= 2) && (mode < 18))
@@ -595,7 +598,7 @@ void LookaheadRow::estimateCUCost(Lowres
                 icost = cost;
         }
 
-        // TOOD: i_icost += intra_penalty + lowres_penalty;
+        icost += intraPenalty + lowresPenalty;
         fenc->intraCost[cuXY] = icost;
         fenc->rowSatds[0][0][cuy] += icost;
         if (bFrameScoreCU)
@@ -605,6 +608,7 @@ void LookaheadRow::estimateCUCost(Lowres
                 costIntraAq += (icost * fenc->invQscaleFactor[cuXY] + 128) >> 8;
         }
     }
+    bcost += lowresPenalty;
     if (!bBidir)
     {
         if (fenc->intraCost[cuXY] < bcost)
@@ -631,292 +635,189 @@ void LookaheadRow::estimateCUCost(Lowres
 
 void Lookahead::slicetypeDecide()
 {
-    if (cfg->param.bFrameAdaptive && cfg->param.lookaheadDepth && cfg->param.bframes)
+    if ((cfg->param.bFrameAdaptive && cfg->param.bframes) ||
+        cfg->param.rc.cuTree || cfg->param.scenecutThreshold ||
+        (cfg->param.lookaheadDepth && cfg->param.rc.vbvBufferSize))
     {
         slicetypeAnalyse(false);
+    }
 
-        TComPic *list[X265_LOOKAHEAD_MAX];
-        TComPic *ipic = inputQueue.first();
-        int j;
-        for (j = 0; ipic && j < cfg->param.bframes + 2; ipic = ipic->m_next)
+
+    TComPic *list[X265_LOOKAHEAD_MAX];
+    TComPic *ipic = inputQueue.first();
+    int j;
+    for (j = 0; ipic && j < cfg->param.bframes + 2; ipic = ipic->m_next)


More information about the x265-commits mailing list