[x265-commits] [x265] sao: created new primitive for saoCuStatsE1

Dnyaneshwar G dnyaneshwar at multicorewareinc.com
Mon Jul 6 17:48:40 CEST 2015


details:   http://hg.videolan.org/x265/rev/81e30626b711
branches:  
changeset: 10764:81e30626b711
user:      Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
date:      Wed Jul 01 16:49:24 2015 +0530
description:
sao: created new primitive for saoCuStatsE1
Subject: [x265] sao: created new primitive for saoCuStatsE0

details:   http://hg.videolan.org/x265/rev/eca6c4408c90
branches:  
changeset: 10765:eca6c4408c90
user:      Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
date:      Wed Jul 01 16:50:32 2015 +0530
description:
sao: created new primitive for saoCuStatsE0
Subject: [x265] sao: created new primitive for saoCuStatsBO

details:   http://hg.videolan.org/x265/rev/54a45f1f7469
branches:  
changeset: 10766:54a45f1f7469
user:      Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
date:      Wed Jul 01 16:51:20 2015 +0530
description:
sao: created new primitive for saoCuStatsBO
Subject: [x265] asm: avx2 code for weight_sp() 16bpp

details:   http://hg.videolan.org/x265/rev/2fab331dd158
branches:  
changeset: 10767:2fab331dd158
user:      Aasaipriya Chandran <aasaipriya at multicorewareinc.com>
date:      Mon Jun 29 12:49:55 2015 +0530
description:
asm: avx2 code for weight_sp() 16bpp

 avx2: weight_sp  11.37x   4496.63         51139.20
 sse4: weight_sp  6.48x    8163.87         52870.36
Subject: [x265] motion: fix overflow in mvcost() check failure.

details:   http://hg.videolan.org/x265/rev/bf57ce5d38d5
branches:  
changeset: 10768:bf57ce5d38d5
user:      Divya Manivannan <divya at multicorewareinc.com>
date:      Fri Jul 03 19:50:02 2015 +0530
description:
motion: fix overflow in mvcost() check failure.

The mvcost() function always truncates the value to 16 bits, but there is a possibility
of overflow in the mvcost() check failure condition given.

This check can be done in the cost calculation (s_costs) initially, and so it is
removed here.
Subject: [x265] Merge

details:   http://hg.videolan.org/x265/rev/aade1fffa9bd
branches:  
changeset: 10769:aade1fffa9bd
user:      Steve Borho <steve at borho.org>
date:      Mon Jul 06 10:48:21 2015 -0500
description:
Merge

diffstat:

 build/msys/multilib.sh               |    2 +-
 source/common/primitives.h           |    6 +
 source/common/x86/asm-primitives.cpp |    1 +
 source/common/x86/pixel-util8.asm    |  126 +++++++++++++++++++++++-
 source/encoder/motion.cpp            |   40 ++-----
 source/encoder/sao.cpp               |  185 +++++++++++++++++-----------------
 6 files changed, 236 insertions(+), 124 deletions(-)

diffs (truncated from 516 to 300 lines):

diff -r 1162fb0b99f8 -r aade1fffa9bd build/msys/multilib.sh
--- a/build/msys/multilib.sh	Fri Jul 03 13:43:47 2015 -0500
+++ b/build/msys/multilib.sh	Mon Jul 06 10:48:21 2015 -0500
@@ -13,5 +13,5 @@ make ${MAKEFLAGS}
 cp libx265.a ../8bit/libx265_main10.a
 
 cd ../8bit
-cmake -G "MSYS Makefiles" ../../../source -DEXPORT_C_API=OFF -DENABLE_SHARED=OFF -DENABLE_CLI=ON -DEXTRA_LIB="x265_main10.a x265_main12.a" -DEXTRA_LINK_FLAGS=-L.
+cmake -G "MSYS Makefiles" ../../../source -DEXPORT_C_API=OFF -DENABLE_SHARED=OFF -DENABLE_CLI=ON -DEXTRA_LIB="x265_main10.a;x265_main12.a" -DEXTRA_LINK_FLAGS=-L.
 make ${MAKEFLAGS}
diff -r 1162fb0b99f8 -r aade1fffa9bd source/common/primitives.h
--- a/source/common/primitives.h	Fri Jul 03 13:43:47 2015 -0500
+++ b/source/common/primitives.h	Mon Jul 06 10:48:21 2015 -0500
@@ -174,6 +174,9 @@ typedef void (*saoCuOrgE2_t)(pixel* rec,
 typedef void (*saoCuOrgE3_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX);
 typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
 
+typedef void (*saoCuStatsBO_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsE0_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsE1_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
 typedef void (*saoCuStatsE2_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBuff, int endX, int endY, int32_t *stats, int32_t *count);
 typedef void (*saoCuStatsE3_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
 
@@ -297,6 +300,9 @@ struct EncoderPrimitives
     saoCuOrgE3_t          saoCuOrgE3[2];
     saoCuOrgB0_t          saoCuOrgB0;
 
+    saoCuStatsBO_t        saoCuStatsBO;
+    saoCuStatsE0_t        saoCuStatsE0;
+    saoCuStatsE1_t        saoCuStatsE1;
     saoCuStatsE2_t        saoCuStatsE2;
     saoCuStatsE3_t        saoCuStatsE3;
 
diff -r 1162fb0b99f8 -r aade1fffa9bd source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Jul 03 13:43:47 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp	Mon Jul 06 10:48:21 2015 -0500
@@ -1524,6 +1524,7 @@ void setupAssemblyPrimitives(EncoderPrim
         p.scale1D_128to64 = PFX(scale1D_128to64_avx2);
         p.scale2D_64to32 = PFX(scale2D_64to32_avx2);
         p.weight_pp = PFX(weight_pp_avx2);
+        p.weight_sp = PFX(weight_sp_avx2);
         p.sign = PFX(calSign_avx2);
         p.planecopy_cp = PFX(upShift_8_avx2);
 
diff -r 1162fb0b99f8 -r aade1fffa9bd source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Fri Jul 03 13:43:47 2015 -0500
+++ b/source/common/x86/pixel-util8.asm	Mon Jul 06 10:48:21 2015 -0500
@@ -1674,8 +1674,128 @@ cglobal weight_sp, 6, 7, 7, 0-(2*4)
     dec         r5d
     jnz         .loopH
     RET
-
-%if ARCH_X86_64
+%endif
+
+
+%if HIGH_BIT_DEPTH
+INIT_YMM avx2
+cglobal weight_sp, 6,7,9
+    mova                      m1, [pw_1023]
+    mova                      m2, [pw_1]
+    mov                       r6d, r7m
+    shl                       r6d, 16
+    or                        r6d, r6m
+    vpbroadcastd              m3, r6d      ; m3 = [round w0]
+    movd                      xm4, r8m     ; m4 = [shift]
+    vpbroadcastd              m5, r9m      ; m5 = [offset]
+
+    ; correct row stride
+    add                       r3d, r3d
+    add                       r2d, r2d
+    mov                       r6d, r4d
+    and                       r6d, ~(mmsize / SIZEOF_PIXEL - 1)
+    sub                       r3d, r6d
+    sub                       r3d, r6d
+    sub                       r2d, r6d
+    sub                       r2d, r6d
+
+    ; generate partial width mask (MUST BE IN YMM0)
+    mov                       r6d, r4d
+    and                       r6d, (mmsize / SIZEOF_PIXEL - 1)
+    movd                      xm0, r6d
+    pshuflw                   m0, m0, 0
+    punpcklqdq                m0, m0
+    vinserti128               m0, m0, xm0, 1
+    pcmpgtw                   m0, [pw_0_15]
+
+.loopH:
+    mov                       r6d, r4d
+
+.loopW:
+    movu                      m6, [r0]
+    paddw                     m6, [pw_2000]
+
+    punpcklwd                 m7, m6, m2
+    pmaddwd                   m7, m3       ;(round w0)
+    psrad                     m7, xm4      ;(shift)
+    paddd                     m7, m5       ;(offset)
+
+    punpckhwd                 m6, m2
+    pmaddwd                   m6, m3
+    psrad                     m6, xm4
+    paddd                     m6, m5
+
+    packusdw                  m7, m6
+    pminuw                    m7, m1
+
+    sub                       r6d, (mmsize / SIZEOF_PIXEL)
+    jl                        .width14
+    movu                      [r1], m7
+    lea                       r0, [r0 + mmsize]
+    lea                       r1, [r1 + mmsize]
+    je                        .nextH
+    jmp                       .loopW
+
+.width14:
+    add                       r6d, 16
+    cmp                       r6d, 14
+    jl                        .width12
+    movu                      [r1], xm7
+    vextracti128              xm8, m7, 1
+    movq                      [r1 + 16], xm8
+    pextrd                    [r1 + 24], xm8, 2
+    je                        .nextH
+
+.width12:
+    cmp                       r6d, 12
+    jl                        .width10
+    movu                      [r1], xm7
+    vextracti128              xm8, m7, 1
+    movq                      [r1 + 16], xm8
+    je                        .nextH
+
+.width10:
+    cmp                       r6d, 10
+    jl                        .width8
+    movu                      [r1], xm7
+    vextracti128              xm8, m7, 1
+    movd                      [r1 + 16], xm8
+    je                        .nextH
+
+.width8:
+    cmp                       r6d, 8
+    jl                        .width6
+    movu                      [r1], xm7
+    je                        .nextH
+
+.width6
+    cmp                       r6d, 6
+    jl                        .width4
+    movq                      [r1], xm7
+    pextrd                    [r1 + 8], xm7, 2
+    je                        .nextH
+
+.width4:
+    cmp                       r6d, 4
+    jl                        .width2
+    movq                      [r1], xm7
+    je                        .nextH
+    add                       r1, 4
+    pshufd                    m6, m6, 1
+    je                        .nextH
+
+.width2:
+    movd                      [r1], xm7
+
+.nextH:
+    add                       r0, r2
+    add                       r1, r3
+
+    dec                       r5d
+    jnz                       .loopH
+    RET
+
+%else
 INIT_YMM avx2
 cglobal weight_sp, 6, 9, 7
     mov             r7d, r7m
@@ -1752,8 +1872,6 @@ cglobal weight_sp, 6, 9, 7
     jnz             .loopH
     RET
 %endif
-%endif  ; end of (HIGH_BIT_DEPTH == 0)
-    
 
 ;-----------------------------------------------------------------
 ; void transpose_4x4(pixel *dst, pixel *src, intptr_t stride)
diff -r 1162fb0b99f8 -r aade1fffa9bd source/encoder/motion.cpp
--- a/source/encoder/motion.cpp	Fri Jul 03 13:43:47 2015 -0500
+++ b/source/encoder/motion.cpp	Mon Jul 06 10:48:21 2015 -0500
@@ -234,14 +234,9 @@ void MotionEstimate::setSourcePU(const Y
                pix_base + (m1x) + (m1y) * stride, \
                pix_base + (m2x) + (m2y) * stride, \
                stride, costs); \
-        const uint16_t *base_mvx = &m_cost_mvx[(bmv.x + (m0x)) << 2]; \
-        const uint16_t *base_mvy = &m_cost_mvy[(bmv.y + (m0y)) << 2]; \
-        X265_CHECK(mvcost((bmv + MV(m0x, m0y)) << 2) == (base_mvx[((m0x) - (m0x)) << 2] + base_mvy[((m0y) - (m0y)) << 2]), "mvcost() check failure\n"); \
-        X265_CHECK(mvcost((bmv + MV(m1x, m1y)) << 2) == (base_mvx[((m1x) - (m0x)) << 2] + base_mvy[((m1y) - (m0y)) << 2]), "mvcost() check failure\n"); \
-        X265_CHECK(mvcost((bmv + MV(m2x, m2y)) << 2) == (base_mvx[((m2x) - (m0x)) << 2] + base_mvy[((m2y) - (m0y)) << 2]), "mvcost() check failure\n"); \
-        (costs)[0] += (base_mvx[((m0x) - (m0x)) << 2] + base_mvy[((m0y) - (m0y)) << 2]); \
-        (costs)[1] += (base_mvx[((m1x) - (m0x)) << 2] + base_mvy[((m1y) - (m0y)) << 2]); \
-        (costs)[2] += (base_mvx[((m2x) - (m0x)) << 2] + base_mvy[((m2y) - (m0y)) << 2]); \
+        (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \
+        (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
+        (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
     }
 
 #define COST_MV_PT_DIST_X4(m0x, m0y, p0, d0, m1x, m1y, p1, d1, m2x, m2y, p2, d2, m3x, m3y, p3, d3) \
@@ -271,16 +266,10 @@ void MotionEstimate::setSourcePU(const Y
                pix_base + (m2x) + (m2y) * stride, \
                pix_base + (m3x) + (m3y) * stride, \
                stride, costs); \
-        const uint16_t *base_mvx = &m_cost_mvx[(omv.x << 2)]; \
-        const uint16_t *base_mvy = &m_cost_mvy[(omv.y << 2)]; \
-        X265_CHECK(mvcost((omv + MV(m0x, m0y)) << 2) == (base_mvx[(m0x) << 2] + base_mvy[(m0y) << 2]), "mvcost() check failure\n"); \
-        X265_CHECK(mvcost((omv + MV(m1x, m1y)) << 2) == (base_mvx[(m1x) << 2] + base_mvy[(m1y) << 2]), "mvcost() check failure\n"); \
-        X265_CHECK(mvcost((omv + MV(m2x, m2y)) << 2) == (base_mvx[(m2x) << 2] + base_mvy[(m2y) << 2]), "mvcost() check failure\n"); \
-        X265_CHECK(mvcost((omv + MV(m3x, m3y)) << 2) == (base_mvx[(m3x) << 2] + base_mvy[(m3y) << 2]), "mvcost() check failure\n"); \
-        costs[0] += (base_mvx[(m0x) << 2] + base_mvy[(m0y) << 2]); \
-        costs[1] += (base_mvx[(m1x) << 2] + base_mvy[(m1y) << 2]); \
-        costs[2] += (base_mvx[(m2x) << 2] + base_mvy[(m2y) << 2]); \
-        costs[3] += (base_mvx[(m3x) << 2] + base_mvy[(m3y) << 2]); \
+        costs[0] += mvcost((omv + MV(m0x, m0y)) << 2); \
+        costs[1] += mvcost((omv + MV(m1x, m1y)) << 2); \
+        costs[2] += mvcost((omv + MV(m2x, m2y)) << 2); \
+        costs[3] += mvcost((omv + MV(m3x, m3y)) << 2); \
         COPY2_IF_LT(bcost, costs[0], bmv, omv + MV(m0x, m0y)); \
         COPY2_IF_LT(bcost, costs[1], bmv, omv + MV(m1x, m1y)); \
         COPY2_IF_LT(bcost, costs[2], bmv, omv + MV(m2x, m2y)); \
@@ -296,17 +285,10 @@ void MotionEstimate::setSourcePU(const Y
                pix_base + (m2x) + (m2y) * stride, \
                pix_base + (m3x) + (m3y) * stride, \
                stride, costs); \
-        /* TODO: use restrict keyword in ICL */ \
-        const uint16_t *base_mvx = &m_cost_mvx[(bmv.x << 2)]; \
-        const uint16_t *base_mvy = &m_cost_mvy[(bmv.y << 2)]; \
-        X265_CHECK(mvcost((bmv + MV(m0x, m0y)) << 2) == (base_mvx[(m0x) << 2] + base_mvy[(m0y) << 2]), "mvcost() check failure\n"); \
-        X265_CHECK(mvcost((bmv + MV(m1x, m1y)) << 2) == (base_mvx[(m1x) << 2] + base_mvy[(m1y) << 2]), "mvcost() check failure\n"); \
-        X265_CHECK(mvcost((bmv + MV(m2x, m2y)) << 2) == (base_mvx[(m2x) << 2] + base_mvy[(m2y) << 2]), "mvcost() check failure\n"); \
-        X265_CHECK(mvcost((bmv + MV(m3x, m3y)) << 2) == (base_mvx[(m3x) << 2] + base_mvy[(m3y) << 2]), "mvcost() check failure\n"); \
-        (costs)[0] += (base_mvx[(m0x) << 2] + base_mvy[(m0y) << 2]); \
-        (costs)[1] += (base_mvx[(m1x) << 2] + base_mvy[(m1y) << 2]); \
-        (costs)[2] += (base_mvx[(m2x) << 2] + base_mvy[(m2y) << 2]); \
-        (costs)[3] += (base_mvx[(m3x) << 2] + base_mvy[(m3y) << 2]); \
+        (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \
+        (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
+        (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
+        (costs)[3] += mvcost((bmv + MV(m3x, m3y)) << 2); \
     }
 
 #define DIA1_ITER(mx, my) \
diff -r 1162fb0b99f8 -r aade1fffa9bd source/encoder/sao.cpp
--- a/source/encoder/sao.cpp	Fri Jul 03 13:43:47 2015 -0500
+++ b/source/encoder/sao.cpp	Mon Jul 06 10:48:21 2015 -0500
@@ -671,7 +671,6 @@ void SAO::copySaoUnit(SaoCtuParam* saoUn
 /* Calculate SAO statistics for current CTU without non-crossing slice */
 void SAO::calcSaoStatsCu(int addr, int plane)
 {
-    int x, y;
     const CUData* cu = m_frame->m_encData->getPicCTU(addr);
     const pixel* fenc0 = m_frame->m_fencPic->getPlaneAddr(plane, addr);
     const pixel* rec0  = m_frame->m_reconPic->getPlaneAddr(plane, addr);
@@ -702,8 +701,6 @@ void SAO::calcSaoStatsCu(int addr, int p
     int startY;
     int endX;
     int endY;
-    int32_t* stats;
-    int32_t* count;
 
     int skipB = plane ? 2 : 4;
     int skipR = plane ? 3 : 5;
@@ -711,41 +708,18 @@ void SAO::calcSaoStatsCu(int addr, int p
     int8_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1;
     int8_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1;
 
-    // Dynamic Range: 64x64x14bpp = 24bits
-    int32_t tmp_stats[NUM_EDGETYPE];
-    // TODO: improve by uint64_t, but need Haswell SHLX
-    uint16_t tmp_count[NUM_EDGETYPE];
-
     // SAO_BO:
     {
-        const int boShift = X265_DEPTH - SAO_BO_BITS;
-
         if (m_param->bSaoNonDeblocked)
         {
             skipB = plane ? 1 : 3;
             skipR = plane ? 2 : 4;
         }
-        stats = m_offsetOrg[plane][SAO_BO];
-        count = m_count[plane][SAO_BO];
-
-        fenc = fenc0;
-        rec  = rec0;
 
         endX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR;
         endY = (bpely == picHeight) ? ctuHeight : ctuHeight - skipB;
 
-        for (y = 0; y < endY; y++)
-        {


More information about the x265-commits mailing list