[x265-commits] [x265] assembly code for intra_pred_ang8_3

Yuvaraj Venkatesh yuvaraj at multicorewareinc.com
Tue Jan 21 23:41:36 CET 2014


details:   http://hg.videolan.org/x265/rev/b51c1866363d
branches:  
changeset: 5855:b51c1866363d
user:      Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
date:      Fri Jan 17 10:49:42 2014 +0530
description:
assembly code for intra_pred_ang8_3
Subject: [x265] Merge

details:   http://hg.videolan.org/x265/rev/ce41ee0f5c8c
branches:  
changeset: 5856:ce41ee0f5c8c
user:      Deepthi Nandakumar <deepthi at multicorewareinc.com>
date:      Tue Jan 21 10:11:44 2014 +0530
description:
Merge
Subject: [x265] yuv: skip frames one at a time to prevent offset overflow

details:   http://hg.videolan.org/x265/rev/e12bb1346bef
branches:  stable
changeset: 5857:e12bb1346bef
user:      Steve Borho <steve at borho.org>
date:      Tue Jan 21 12:30:58 2014 -0600
description:
yuv: skip frames one at a time to prevent offset overflow
Subject: [x265] slicetype: fix cuTree mv indexing (bug found by herman.chen at rock-chips.com)

details:   http://hg.videolan.org/x265/rev/3cf5a75a8002
branches:  stable
changeset: 5858:3cf5a75a8002
user:      Steve Borho <steve at borho.org>
date:      Tue Jan 21 12:58:25 2014 -0600
description:
slicetype: fix cuTree mv indexing (bug found by herman.chen at rock-chips.com)

This drops the bitrate by almost 20% and the SSIM by 0.4-0.9 dB.  I believe this
needs rebalancing.
Subject: [x265] Merge with stable

details:   http://hg.videolan.org/x265/rev/0f0ad4c094bd
branches:  
changeset: 5859:0f0ad4c094bd
user:      Steve Borho <steve at borho.org>
date:      Tue Jan 21 16:39:44 2014 -0600
description:
Merge with stable

diffstat:

 source/common/x86/asm-primitives.cpp |    3 +
 source/common/x86/intrapred8.asm     |   95 ++++++++-
 source/encoder/encoder.cpp           |    6 +-
 source/encoder/slicetype.cpp         |  410 +++++++++++++---------------------
 source/encoder/slicetype.h           |    3 +-
 source/input/yuv.cpp                 |    5 +-
 source/x265.cpp                      |    2 +-
 7 files changed, 261 insertions(+), 263 deletions(-)

diffs (truncated from 703 to 300 lines):

diff -r cf79f89c783c -r 0f0ad4c094bd source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Jan 20 18:33:40 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Jan 21 16:39:44 2014 -0600
@@ -1015,6 +1015,9 @@ void Setup_Assembly_Primitives(EncoderPr
 
         SETUP_INTRA_ANG32(17, 17, sse4);
 
+        SETUP_INTRA_ANG8(3, 3, sse4);
+        SETUP_INTRA_ANG8(33, 3, sse4);
+
         p.dct[DCT_8x8] = x265_dct8_sse4;
     }
     if (cpuMask & X265_CPU_AVX)
diff -r cf79f89c783c -r 0f0ad4c094bd source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm	Mon Jan 20 18:33:40 2014 +0530
+++ b/source/common/x86/intrapred8.asm	Tue Jan 21 16:39:44 2014 -0600
@@ -1109,7 +1109,7 @@ cglobal intra_pred_ang4_18, 4,4,1
     movd        [r0], m0
     RET
 ;-----------------------------------------------------------------------------
-; void intraPredAng(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
+; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
 ;-----------------------------------------------------------------------------
 INIT_XMM ssse3
 cglobal intra_pred_ang8_2, 3,5,2
@@ -1136,6 +1136,99 @@ cglobal intra_pred_ang8_2, 3,5,2
     movh        [r0 + r4],      m1
     RET
 
+INIT_XMM sse4
+cglobal intra_pred_ang8_3, 3,5,8
+    cmp         r4m,       byte 33
+    cmove       r2,        r3mp
+    lea         r3,        [ang_table + 14 * 16]
+    mova        m7,        [pw_1024]
+
+    movu        m0,        [r2 + 1]                   ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
+    palignr     m1,        m0, 1                      ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
+
+    punpckhbw   m2,        m0, m1                     ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
+    punpcklbw   m0,        m1                         ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
+    palignr     m1,        m2, m0, 2                  ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
+
+    movu        m3,        [r3 + 12 * 16]             ; [26]
+    movu        m6,        [r3 + 6 * 16]              ; [20]
+
+    pmaddubsw   m4,        m0, m3
+    pmulhrsw    m4,        m7
+    pmaddubsw   m1,        m6
+    pmulhrsw    m1,        m7
+    packuswb    m4,        m1
+
+    palignr     m5,        m2, m0, 4                  ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
+
+    movu        m3,        [r3]                       ; [14]
+    pmaddubsw   m5,        m3
+    pmulhrsw    m5,        m7
+
+    palignr     m6,        m2, m0, 6                  ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]
+
+    movu        m3,        [r3 - 6 * 16]              ; [ 8]
+    pmaddubsw   m6,        m3
+    pmulhrsw    m6,        m7
+    packuswb    m5,        m6
+
+    palignr     m1,        m2, m0, 8                  ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]
+
+    movu        m3,        [r3 - 12 * 16]             ; [ 2]
+    pmaddubsw   m6,        m1, m3
+    pmulhrsw    m6,        m7
+
+    movu        m3,        [r3 + 14 * 16]             ; [28]
+    pmaddubsw   m1,        m3
+    pmulhrsw    m1,        m7
+    packuswb    m6,        m1
+
+    palignr     m1,        m2, m0, 10                 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6]
+
+    movu        m3,        [r3 + 8 * 16]              ; [22]
+    pmaddubsw   m1,        m3
+    pmulhrsw    m1,        m7
+
+    palignr     m2,        m0, 12                     ; [15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7]
+
+    movu        m3,        [r3 + 2 * 16]              ; [16]
+    pmaddubsw   m2,        m3
+    pmulhrsw    m2,        m7
+    packuswb    m1,        m2
+
+    jz         .store
+
+    ; transpose 8x8
+    punpckhbw   m0,        m4, m5
+    punpcklbw   m4,        m5
+    punpckhbw   m2,        m4, m0
+    punpcklbw   m4,        m0
+
+    punpckhbw   m0,        m6, m1
+    punpcklbw   m6,        m1
+    punpckhbw   m1,        m6, m0
+    punpcklbw   m6,        m0
+
+    punpckhdq   m5,        m4, m6
+    punpckldq   m4,        m6
+    punpckldq   m6,        m2, m1
+    punpckhdq   m2,        m1
+    mova        m1,        m2
+
+.store:
+    lea         r4,              [r1 * 3]
+    movh        [r0],            m4
+    movhps      [r0 + r1],       m4
+    movh        [r0 + r1 * 2],   m5
+    movhps      [r0 + r4],       m5
+    lea         r0,              [r0 + r1 * 4]
+    movh        [r0],            m6
+    movhps      [r0 + r1],       m6
+    movh        [r0 + r1 * 2],   m1
+    movhps      [r0 + r4],       m1
+
+    RET
+
 ;-----------------------------------------------------------------------------
 ; void intraPredAng16(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
 ;-----------------------------------------------------------------------------
diff -r cf79f89c783c -r 0f0ad4c094bd source/encoder/encoder.cpp
--- a/source/encoder/encoder.cpp	Mon Jan 20 18:33:40 2014 +0530
+++ b/source/encoder/encoder.cpp	Tue Jan 21 16:39:44 2014 -0600
@@ -543,10 +543,10 @@ void Encoder::printSummary()
                 if (sliceType == I_SLICE)
                 {
                     if (depth == (int)g_maxCUDepth - 1)
-                        len += sprintf(stats + len, " %dx%d: "LL"%%", cuSize/2, cuSize/2, cntIntraNxN);
+                        len += sprintf(stats + len, " %dx%d: "LL"%%", cuSize / 2, cuSize / 2, cntIntraNxN);
                 }
             }
-            const char slicechars = "BPI";
+            const char slicechars[] = "BPI";
             if (stats[0])
                 x265_log(&param, X265_LOG_INFO, "%c%-2d: %s\n", slicechars[sliceType], cuSize, stats);
         }
@@ -1324,7 +1324,7 @@ void Encoder::configure(x265_param *_par
         _param->rc.aqStrength = 0.0;
     }
 
-    if (_param->bFrameAdaptive == 0 && _param->rc.cuTree)
+    if (_param->lookaheadDepth == 0 && _param->rc.cuTree)
     {
         x265_log(_param, X265_LOG_WARNING, "cuTree disabled, requires lookahead to be enabled\n");
         _param->rc.cuTree = 0;
diff -r cf79f89c783c -r 0f0ad4c094bd source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp	Mon Jan 20 18:33:40 2014 +0530
+++ b/source/encoder/slicetype.cpp	Tue Jan 21 16:39:44 2014 -0600
@@ -461,6 +461,8 @@ void LookaheadRow::estimateCUCost(Lowres
                                 cuy > 0 && cuy < heightInCU - 1) || widthInCU <= 2 || heightInCU <= 2;
 
     me.setSourcePU(pelOffset, cuSize, cuSize);
+    /* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. */
+    int lowresPenalty = 4;
 
     MV(*fenc_mvs[2]) = { &fenc->lowresMvs[0][b - p0 - 1][cuXY],
                          &fenc->lowresMvs[1][p1 - b - 1][cuXY] };
@@ -585,6 +587,7 @@ void LookaheadRow::estimateCUCost(Lowres
         primitives.transpose[nLog2SizeMinus2](buf_trans, me.fenc, FENC_STRIDE);
         pixelcmp_t satd = primitives.satd[partitionFromSizes(cuSize, cuSize)];
         int icost = me.COST_MAX, cost;
+        const int intraPenalty = 5 * lookAheadLambda;
         for (uint32_t mode = 0; mode < 35; mode++)
         {
             if ((mode >= 2) && (mode < 18))
@@ -595,7 +598,7 @@ void LookaheadRow::estimateCUCost(Lowres
                 icost = cost;
         }
 
-        // TOOD: i_icost += intra_penalty + lowres_penalty;
+        icost += intraPenalty + lowresPenalty;
         fenc->intraCost[cuXY] = icost;
         fenc->rowSatds[0][0][cuy] += icost;
         if (bFrameScoreCU)
@@ -605,6 +608,7 @@ void LookaheadRow::estimateCUCost(Lowres
                 costIntraAq += (icost * fenc->invQscaleFactor[cuXY] + 128) >> 8;
         }
     }
+    bcost += lowresPenalty;
     if (!bBidir)
     {
         if (fenc->intraCost[cuXY] < bcost)
@@ -631,292 +635,189 @@ void LookaheadRow::estimateCUCost(Lowres
 
 void Lookahead::slicetypeDecide()
 {
-    if (cfg->param.bFrameAdaptive && cfg->param.lookaheadDepth && cfg->param.bframes)
+    if ((cfg->param.bFrameAdaptive && cfg->param.bframes) ||
+        cfg->param.rc.cuTree || cfg->param.scenecutThreshold ||
+        (cfg->param.lookaheadDepth && cfg->param.rc.vbvBufferSize))
     {
         slicetypeAnalyse(false);
+    }
 
-        TComPic *list[X265_LOOKAHEAD_MAX];
-        TComPic *ipic = inputQueue.first();
-        int j;
-        for (j = 0; ipic && j < cfg->param.bframes + 2; ipic = ipic->m_next)
+
+    TComPic *list[X265_LOOKAHEAD_MAX];
+    TComPic *ipic = inputQueue.first();
+    int j;
+    for (j = 0; ipic && j < cfg->param.bframes + 2; ipic = ipic->m_next)
+    {
+        list[j++] = ipic;
+    }
+
+    list[j] = NULL;
+
+    int bframes, brefs;
+    for (bframes = 0, brefs = 0;; bframes++)
+    {
+        Lowres& frm = list[bframes]->m_lowres;
+
+        if (frm.sliceType == X265_TYPE_BREF && !cfg->param.bBPyramid && brefs == cfg->param.bBPyramid)
         {
-            list[j++] = ipic;
+            frm.sliceType = X265_TYPE_B;
+            x265_log(&cfg->param, X265_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid\n",
+                     frm.frameNum);
         }
 
-        list[j] = NULL;
+        /* pyramid with multiple B-refs needs a big enough dpb that the preceding P-frame stays available.
+           smaller dpb could be supported by smart enough use of mmco, but it's easier just to forbid it.*/
+        else if (frm.sliceType == X265_TYPE_BREF && cfg->param.bBPyramid && brefs &&
+                 cfg->param.maxNumReferences <= (brefs + 3))
+        {
+            frm.sliceType = X265_TYPE_B;
+            x265_log(&cfg->param, X265_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid and %d reference frames\n",
+                     frm.sliceType, cfg->param.maxNumReferences);
+        }
 
-        int bframes, brefs;
-        for (bframes = 0, brefs = 0;; bframes++)
+        if (frm.sliceType == X265_TYPE_KEYFRAME)
+            frm.sliceType = cfg->param.bOpenGOP ? X265_TYPE_I : X265_TYPE_IDR;
+        if (( /* !cfg->param.intraRefresh || */ frm.frameNum == 0) && frm.frameNum - lastKeyframe >= cfg->param.keyframeMax)
         {
-            Lowres& frm = list[bframes]->m_lowres;
-
-            if (frm.sliceType == X265_TYPE_BREF && !cfg->param.bBPyramid && brefs == cfg->param.bBPyramid)
+            if (frm.sliceType == X265_TYPE_AUTO || frm.sliceType == X265_TYPE_I)
+                frm.sliceType = cfg->param.bOpenGOP && lastKeyframe >= 0 ? X265_TYPE_I : X265_TYPE_IDR;
+            bool warn = frm.sliceType != X265_TYPE_IDR;
+            if (warn && cfg->param.bOpenGOP)
+                warn &= frm.sliceType != X265_TYPE_I;
+            if (warn)
             {
-                frm.sliceType = X265_TYPE_B;
-                x265_log(&cfg->param, X265_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid\n",
-                         frm.frameNum);
+                x265_log(&cfg->param, X265_LOG_WARNING, "specified frame type (%d) at %d is not compatible with keyframe interval\n",
+                         frm.sliceType, frm.frameNum);
+                frm.sliceType = cfg->param.bOpenGOP && lastKeyframe >= 0 ? X265_TYPE_I : X265_TYPE_IDR;
             }
-
-            /* pyramid with multiple B-refs needs a big enough dpb that the preceding P-frame stays available.
-               smaller dpb could be supported by smart enough use of mmco, but it's easier just to forbid it.*/
-            else if (frm.sliceType == X265_TYPE_BREF && cfg->param.bBPyramid && brefs &&
-                     cfg->param.maxNumReferences <= (brefs + 3))
+        }
+        if (frm.sliceType == X265_TYPE_I && frm.frameNum - lastKeyframe >= cfg->param.keyframeMin)
+        {
+            if (cfg->param.bOpenGOP)
             {
-                frm.sliceType = X265_TYPE_B;
-                x265_log(&cfg->param, X265_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid and %d reference frames\n",
-                         frm.sliceType, cfg->param.maxNumReferences);
-            }
-
-            if (frm.sliceType == X265_TYPE_KEYFRAME)
-                frm.sliceType = cfg->param.bOpenGOP ? X265_TYPE_I : X265_TYPE_IDR;
-            if (( /* !cfg->param.intraRefresh || */ frm.frameNum == 0) && frm.frameNum - lastKeyframe >= cfg->param.keyframeMax)
-            {
-                if (frm.sliceType == X265_TYPE_AUTO || frm.sliceType == X265_TYPE_I)
-                    frm.sliceType = cfg->param.bOpenGOP && lastKeyframe >= 0 ? X265_TYPE_I : X265_TYPE_IDR;
-                bool warn = frm.sliceType != X265_TYPE_IDR;
-                if (warn && cfg->param.bOpenGOP)
-                    warn &= frm.sliceType != X265_TYPE_I;
-                if (warn)
-                {
-                    x265_log(&cfg->param, X265_LOG_WARNING, "specified frame type (%d) at %d is not compatible with keyframe interval\n", frm.sliceType, frm.frameNum);
-                    frm.sliceType = cfg->param.bOpenGOP && lastKeyframe >= 0 ? X265_TYPE_I : X265_TYPE_IDR;
-                }
-            }
-            if (frm.sliceType == X265_TYPE_I && frm.frameNum - lastKeyframe >= cfg->param.keyframeMin)
-            {
-                if (cfg->param.bOpenGOP)
-                {
-                    lastKeyframe = frm.frameNum;
-                    frm.bKeyframe = true;
-                }
-                else
-                    frm.sliceType = X265_TYPE_IDR;


More information about the x265-commits mailing list