[x265-commits] [x265] asm: fix invalid read in upShift routine

Thu Apr 3 22:03:59 CEST 2014

details:   http://hg.videolan.org/x265/rev/82bbd2bf3b49
branches:  stable
changeset: 6657:82bbd2bf3b49
user:      Murugan Vairavel <murugan at multicorewareinc.com>
date:      Thu Apr 03 11:30:44 2014 +0530
description:
asm: fix invalid read in upShift routine
Subject: [x265] Added tag 0.9 for changeset 82bbd2bf3b49

details:   http://hg.videolan.org/x265/rev/640f9177eeb0
branches:  stable
changeset: 6658:640f9177eeb0
user:      Steve Borho <steve at borho.org>
date:      Thu Apr 03 11:54:16 2014 -0500
description:
Added tag 0.9 for changeset 82bbd2bf3b49
Subject: [x265] frameencoder: store the reference state of the picture in FrameEncoder

details:   http://hg.videolan.org/x265/rev/36a66ea7a27e
branches:  
changeset: 6659:36a66ea7a27e
user:      Gopu Govindaswamy
date:      Thu Apr 03 17:24:57 2014 +0530
description:
frameencoder: store the reference state of the picture in FrameEncoder

We found that the reference state of a reference frame can change during the
encode when more than one frame thread is used (frame-thread > 1). This makes
the CU-level QP for the frame non-deterministic, which in turn leads to
non-deterministic encoded output for the frame. To avoid this, store the
reference state of the frame in FrameEncoder->m_isReferenced, and when the QP
is calculated for a CU, read the reference state from
FrameEncoder->m_isReferenced; this state never changes during the encode.

Moved slice reference state initialization from dpb to FrameEncoder initSlice()
Subject: [x265] Merge with stable

details:   http://hg.videolan.org/x265/rev/eef9a0050728
branches:  
changeset: 6660:eef9a0050728
user:      Steve Borho <steve at borho.org>
date:      Thu Apr 03 11:56:58 2014 -0500
description:
Merge with stable
Subject: [x265] frameencoder: comment nit

details:   http://hg.videolan.org/x265/rev/9c1cc2aa053a
branches:  
changeset: 6661:9c1cc2aa053a
user:      Steve Borho <steve at borho.org>
date:      Thu Apr 03 14:47:56 2014 -0500
description:
frameencoder: comment nit
Subject: [x265] frameencoder: use m_isReferenced when configuring SAO in compressFrame()

details:   http://hg.videolan.org/x265/rev/8c946aca5824
branches:  
changeset: 6662:8c946aca5824
user:      Steve Borho <steve at borho.org>
date:      Thu Apr 03 14:49:57 2014 -0500
description:
frameencoder: use m_isReferenced when configuring SAO in compressFrame()

In some pessimal situations, the slice's reference state could even be changed
by the time compressFrame() starts.  Reading FrameEncoder's m_isReferenced
instead prevents any race hazard.

diffstat:

 .hgtags                               |    1 +
 source/Lib/TLibCommon/TComDataCU.cpp  |   24 -
 source/Lib/TLibCommon/TComDataCU.h    |    3 -
 source/Lib/TLibEncoder/TEncSearch.cpp |    4 +-
 source/common/pixel.cpp               |    4 +-
 source/common/primitives.h            |    2 +-
 source/common/x86/pixel-a.asm         |   18 +-
 source/common/x86/pixel-util.h        |   12 +-
 source/common/x86/pixel-util8.asm     |  652 +++++++++++++--------------------
 source/encoder/compress.cpp           |   10 +-
 source/encoder/dpb.cpp                |   11 -
 source/encoder/dpb.h                  |    2 +-
 source/encoder/encoder.cpp            |    6 +
 source/encoder/frameencoder.cpp       |   32 +-
 source/encoder/frameencoder.h         |    3 +-
 source/test/pixelharness.cpp          |   41 +-
 16 files changed, 335 insertions(+), 490 deletions(-)

diffs (truncated from 1291 to 300 lines):

diff -r e03388e98ecc -r 8c946aca5824 .hgtags

--- a/.hgtags	Wed Apr 02 22:51:49 2014 -0500
+++ b/.hgtags	Thu Apr 03 14:49:57 2014 -0500
@@ -10,3 +10,4 @@ 69acb3cb777f977f5edde908069ac565915dd366
 b970ffbdd696e3ce45c93b315902eb6366ff085e 0.6
 d24e2a8c4326b0cd01bfa6c414c5378481af9018 0.7
 527d03c56d6860dc979ddea1196f7e94d13d3e82 0.8
+82bbd2bf3b49ba086be0f0922f91fe0084896351 0.9
diff -r e03388e98ecc -r 8c946aca5824 source/Lib/TLibCommon/TComDataCU.cpp
--- a/source/Lib/TLibCommon/TComDataCU.cpp	Wed Apr 02 22:51:49 2014 -0500
+++ b/source/Lib/TLibCommon/TComDataCU.cpp	Thu Apr 03 14:49:57 2014 -0500
@@ -91,8 +91,6 @@ TComDataCU::TComDataCU()
     m_cuAboveRight = NULL;
     m_cuAbove = NULL;
     m_cuLeft = NULL;
-    m_cuColocated[0] = NULL;
-    m_cuColocated[1] = NULL;
     m_mvpIdx[0] = NULL;
     m_mvpIdx[1] = NULL;
     m_chromaFormat = 0;
@@ -280,9 +278,6 @@ void TComDataCU::initCU(TComPic* pic, ui
     m_cuAboveLeft   = NULL;
     m_cuAboveRight  = NULL;
 
-    m_cuColocated[0] = NULL;
-    m_cuColocated[1] = NULL;
-
     uint32_t uiWidthInCU = pic->getFrameWidthInCU();
     if (m_cuAddr % uiWidthInCU)
     {
@@ -303,16 +298,6 @@ void TComDataCU::initCU(TComPic* pic, ui
     {
         m_cuAboveRight = pic->getCU(m_cuAddr - uiWidthInCU + 1);
     }
-
-    if (getSlice()->getNumRefIdx(REF_PIC_LIST_0) > 0)
-    {
-        m_cuColocated[0] = getSlice()->getRefPic(REF_PIC_LIST_0, 0)->getCU(m_cuAddr);
-    }
-
-    if (getSlice()->getNumRefIdx(REF_PIC_LIST_1) > 0)
-    {
-        m_cuColocated[1] = getSlice()->getRefPic(REF_PIC_LIST_1, 0)->getCU(m_cuAddr);
-    }
 }
 
 /** initialize prediction data with enabling sub-LCU-level delta QP
@@ -457,9 +442,6 @@ void TComDataCU::initSubCU(TComDataCU* c
     m_cuAbove       = cu->getCUAbove();
     m_cuAboveLeft   = cu->getCUAboveLeft();
     m_cuAboveRight  = cu->getCUAboveRight();
-
-    m_cuColocated[0] = cu->getCUColocated(REF_PIC_LIST_0);
-    m_cuColocated[1] = cu->getCUColocated(REF_PIC_LIST_1);
 }
 
 // initialize Sub partition
@@ -526,9 +508,6 @@ void TComDataCU::initSubCU(TComDataCU* c
     m_cuAbove       = cu->getCUAbove();
     m_cuAboveLeft   = cu->getCUAboveLeft();
     m_cuAboveRight  = cu->getCUAboveRight();
-
-    m_cuColocated[0] = cu->getCUColocated(REF_PIC_LIST_0);
-    m_cuColocated[1] = cu->getCUColocated(REF_PIC_LIST_1);
 }
 
 
@@ -620,9 +599,6 @@ void TComDataCU::copyPartFrom(TComDataCU
     m_cuAbove          = cu->getCUAbove();
     m_cuLeft           = cu->getCULeft();
 
-    m_cuColocated[0] = cu->getCUColocated(REF_PIC_LIST_0);
-    m_cuColocated[1] = cu->getCUColocated(REF_PIC_LIST_1);
-
     m_cuMvField[0].copyFrom(cu->getCUMvField(REF_PIC_LIST_0), cu->getTotalNumPart(), offset);
     m_cuMvField[1].copyFrom(cu->getCUMvField(REF_PIC_LIST_1), cu->getTotalNumPart(), offset);
 
diff -r e03388e98ecc -r 8c946aca5824 source/Lib/TLibCommon/TComDataCU.h
--- a/source/Lib/TLibCommon/TComDataCU.h	Wed Apr 02 22:51:49 2014 -0500
+++ b/source/Lib/TLibCommon/TComDataCU.h	Thu Apr 03 14:49:57 2014 -0500
@@ -129,7 +129,6 @@ private:
     TComDataCU*   m_cuAboveRight;    ///< pointer of above-right CU
     TComDataCU*   m_cuAbove;         ///< pointer of above CU
     TComDataCU*   m_cuLeft;          ///< pointer of left CU
-    TComDataCU*   m_cuColocated[2];  ///< pointer of temporally colocated CU's for both directions
 
     // -------------------------------------------------------------------------------------------------------------------
     // coding tool information
@@ -387,8 +386,6 @@ public:
 
     TComDataCU*   getCUAboveRight() { return m_cuAboveRight; }
 
-    TComDataCU*   getCUColocated(int picList) { return m_cuColocated[picList]; }
-
     TComDataCU*   getPULeft(uint32_t& lPartUnitIdx,
                             uint32_t  curPartUnitIdx,
                             bool      bEnforceSliceRestriction = true,
diff -r e03388e98ecc -r 8c946aca5824 source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp	Wed Apr 02 22:51:49 2014 -0500
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp	Thu Apr 03 14:49:57 2014 -0500
@@ -465,7 +465,7 @@ void TEncSearch::xIntraCodingLumaBlk(TCo
 
     assert(width <= 32);
     //===== reconstruction =====
-    primitives.calcrecon[size](pred, residual, 0, reconQt, reconIPred, stride, MAX_CU_SIZE, reconIPredStride);
+    primitives.calcrecon[size](pred, residual, reconQt, reconIPred, stride, MAX_CU_SIZE, reconIPredStride);
     //===== update distortion =====
     outDist += primitives.sse_sp[part](reconQt, MAX_CU_SIZE, fenc, stride);
 }
@@ -587,7 +587,7 @@ void TEncSearch::xIntraCodingChromaBlk(T
     assert(((intptr_t)residual & (width - 1)) == 0);
     assert(width <= 32);
     //===== reconstruction =====
-    primitives.calcrecon[size](pred, residual, 0, reconQt, reconIPred, stride, reconQtStride, reconIPredStride);
+    primitives.calcrecon[size](pred, residual, reconQt, reconIPred, stride, reconQtStride, reconIPredStride);
     //===== update distortion =====
     uint32_t dist = primitives.sse_sp[part](reconQt, reconQtStride, fenc, stride);
     if (ttype == TEXT_CHROMA_U)
diff -r e03388e98ecc -r 8c946aca5824 source/common/pixel.cpp
--- a/source/common/pixel.cpp	Wed Apr 02 22:51:49 2014 -0500
+++ b/source/common/pixel.cpp	Thu Apr 03 14:49:57 2014 -0500
@@ -460,9 +460,7 @@ void getResidual(pixel *fenc, pixel *pre
 }
 
 template<int blockSize>
-void calcRecons(pixel* pred, int16_t* residual,
-                pixel*,
-                int16_t* recqt, pixel* recipred, int stride, int qtstride, int ipredstride)
+void calcRecons(pixel* pred, int16_t* residual, int16_t* recqt, pixel* recipred, int stride, int qtstride, int ipredstride)
 {
     for (int y = 0; y < blockSize; y++)
     {
diff -r e03388e98ecc -r 8c946aca5824 source/common/primitives.h
--- a/source/common/primitives.h	Wed Apr 02 22:51:49 2014 -0500
+++ b/source/common/primitives.h	Thu Apr 03 14:49:57 2014 -0500
@@ -125,7 +125,7 @@ typedef void (*cvt32to16_shr_t)(int16_t 
 typedef void (*dct_t)(int16_t *src, int32_t *dst, intptr_t stride);
 typedef void (*idct_t)(int32_t *src, int16_t *dst, intptr_t stride);
 typedef void (*calcresidual_t)(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
-typedef void (*calcrecon_t)(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
+typedef void (*calcrecon_t)(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
 typedef void (*transpose_t)(pixel* dst, pixel* src, intptr_t stride);
 typedef uint32_t (*quant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos);
 typedef void (*dequant_scaling_t)(const int32_t* src, const int32_t *dequantCoef, int32_t* dst, int num, int mcqp_miper, int shift);
diff -r e03388e98ecc -r 8c946aca5824 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Wed Apr 02 22:51:49 2014 -0500
+++ b/source/common/x86/pixel-a.asm	Thu Apr 03 14:49:57 2014 -0500
@@ -6525,10 +6525,12 @@ cglobal upShift_8, 7,7,3
 .process2:
     cmp         r4d, 2
     jl          .process1
-    movd        m0,[r0]
-    pmovzxbw    m0,m0
-    psllw       m0, m2
-    movd        [r2], m0
+    movzx       r3d, byte [r0]
+    shl         r3d, 2
+    mov         [r2], r3w
+    movzx       r3d, byte [r0 + 1]
+    shl         r3d, 2
+    mov         [r2 + 2], r3w
 
     add         r0, 2
     add         r2, 4
@@ -6536,10 +6538,8 @@ cglobal upShift_8, 7,7,3
     jz          .end
 
 .process1:
-    movd        m0,[r0]
-    pmovzxbw    m0,m0
-    psllw       m0, m2
-    movd        r6, m0
-    mov         [r2], r6w
+    movzx       r3d, byte [r0]
+    shl         r3d, 2
+    mov         [r2], r3w
 .end:
     RET
diff -r e03388e98ecc -r 8c946aca5824 source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h	Wed Apr 02 22:51:49 2014 -0500
+++ b/source/common/x86/pixel-util.h	Thu Apr 03 14:49:57 2014 -0500
@@ -24,12 +24,12 @@
 #ifndef X265_PIXEL_UTIL_H
 #define X265_PIXEL_UTIL_H
 
-void x265_calcRecons4_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
-void x265_calcRecons8_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
-void x265_calcRecons16_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
-void x265_calcRecons32_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
-void x265_calcRecons16_sse4(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
-void x265_calcRecons32_sse4(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
+void x265_calcRecons4_sse2(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
+void x265_calcRecons8_sse2(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
+void x265_calcRecons16_sse2(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
+void x265_calcRecons32_sse2(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
+void x265_calcRecons16_sse4(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
+void x265_calcRecons32_sse4(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
 
 void x265_getResidual4_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
 void x265_getResidual8_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
diff -r e03388e98ecc -r 8c946aca5824 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Wed Apr 02 22:51:49 2014 -0500
+++ b/source/common/x86/pixel-util8.asm	Thu Apr 03 14:49:57 2014 -0500
@@ -58,590 +58,452 @@ cextern pw_2000
 cextern pw_pixel_max
 
 ;-----------------------------------------------------------------------------
-; void calcrecon(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred)
+; void calcrecon(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal calcRecons4
 %if HIGH_BIT_DEPTH
 %if ARCH_X86_64 == 1
-    DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
-    PROLOGUE 6,9,6
+cglobal calcRecons4, 5,8,4
+    %define t7b     r7b
 %else
-    DECLARE_REG_TMP 0,1,2,3,4,5
-    PROLOGUE 6,7,6
-    %define t6      r6m
-    %define t6d     r6d
-    %define t7      r7m
-    %define t8d     r6d
+cglobal calcRecons4, 5,7,4,0-1
+    %define t7b     byte [rsp]
 %endif
-
-    mov         t6d, r6m
-%if ARCH_X86_64 == 0
-    add         t6d, t6d
-    mov         r6m, t6d
-%else
+    mov         r4d, r4m
     mov         r5d, r5m
-    mov         r7d, r7m
-    add         t6d, t6d
-    add         t7, t7
-%endif
+    mov         r6d, r6m
+    add         r4d, r4d
+    add         r5d, r5d
+    add         r6d, r6d
 
     pxor        m4, m4
     mova        m5, [pw_pixel_max]
-    add         t5, t5
-    mov         t8d, 4/2
+    mov         t7b, 4/2
 .loop:
-    movh        m0, [t0]
-    movh        m1, [t0 + t5]
+    movh        m0, [r0]
+    movh        m1, [r0 + r4]
     punpcklqdq  m0, m1
-    movh        m2, [t1]
-    movh        m3, [t1 + t5]
+    movh        m2, [r1]
+    movh        m3, [r1 + r4]
     punpcklqdq  m2, m3
     paddw       m0, m2
     CLIPW       m0, m4, m5
 
-    ; store recon[] and recipred[]
-    movh        [t4], m0
-%if ARCH_X86_64 == 0
-    add         t4, t7
-    add         t4, t7
-    movhps      [t4], m0
-    add         t4, t7
-    add         t4, t7
+    ; store recipred[]
+    movh        [r3], m0
+    movhps      [r3 + r6], m0
+
+    ; store recqt[]
+    movh        [r2], m0
+    movhps      [r2 + r5], m0
+
+    lea         r0, [r0 + r4 * 2]
+    lea         r1, [r1 + r4 * 2]
+    lea         r2, [r2 + r5 * 2]
+    lea         r3, [r3 + r6 * 2]
+
+    dec         t7b
+    jnz        .loop
+    RET
+%else          ;HIGH_BIT_DEPTH
+
+%if ARCH_X86_64 == 1
+cglobal calcRecons4, 5,8,4
+    %define t7b     r7b
 %else
-    movhps      [t4 + t7], m0
-    lea         t4, [t4 + t7 * 2]
+cglobal calcRecons4, 5,7,4,0-1
+    %define t7b     byte [rsp]
 %endif
-