[x265-commits] [x265] cmake: tweak order of compiles to improve parallel build times

Steve Borho steve at borho.org
Tue Jan 14 06:01:53 CET 2014


details:   http://hg.videolan.org/x265/rev/3bc604fdd380
branches:  
changeset: 5821:3bc604fdd380
user:      Steve Borho <steve at borho.org>
date:      Mon Jan 13 11:50:22 2014 -0600
description:
cmake: tweak order of compiles to improve parallel build times

Move assembly and intrinsic files to front of the build
Subject: [x265] TComPicYuv: add a row and col of padding for lowres interpolation

details:   http://hg.videolan.org/x265/rev/8d3cdf1a846e
branches:  stable
changeset: 5822:8d3cdf1a846e
user:      Steve Borho <steve at borho.org>
date:      Mon Jan 13 15:58:06 2014 -0600
description:
TComPicYuv: add a row and col of padding for lowres interpolation
Subject: [x265] TComYuv: pad chroma allocations, fix valgrind warnings

details:   http://hg.videolan.org/x265/rev/aae31685d8c7
branches:  stable
changeset: 5823:aae31685d8c7
user:      Steve Borho <steve at borho.org>
date:      Mon Jan 13 19:46:00 2014 -0600
description:
TComYuv: pad chroma allocations, fix valgrind warnings
Subject: [x265] wavefront: consider enabled bitmap status in checkHigherPriorityRow()

details:   http://hg.videolan.org/x265/rev/8e0fa5fcbf15
branches:  stable
changeset: 5824:8e0fa5fcbf15
user:      Steve Borho <steve at borho.org>
date:      Mon Jan 13 16:05:56 2014 -0600
description:
wavefront: consider enabled bitmap status in checkHigherPriorityRow()

diffstat:

 source/Lib/TLibCommon/TComPicYuv.cpp |   4 +
 source/Lib/TLibCommon/TComYuv.cpp    |   6 +-
 source/common/wavefront.cpp          |   4 +-
 source/common/x86/asm-primitives.cpp |  10 +---
 source/common/x86/intrapred.h        |  38 +--------------
 source/common/x86/intrapred8.asm     |  76 -----------------------------
 source/common/x86/ipfilter8.asm      |  94 ++++++++---------------------------
 7 files changed, 33 insertions(+), 199 deletions(-)

diffs (truncated from 339 to 300 lines):

diff -r 5a607dd446ea -r 8e0fa5fcbf15 source/Lib/TLibCommon/TComPicYuv.cpp
--- a/source/Lib/TLibCommon/TComPicYuv.cpp	Mon Jan 13 11:01:16 2014 -0600
+++ b/source/Lib/TLibCommon/TComPicYuv.cpp	Mon Jan 13 16:05:56 2014 -0600
@@ -214,6 +214,10 @@ void TComPicYuv::copyFromPicture(const x
     rem = height & 15;
     pady = rem ? 16 - rem : pady;
 
+    /* add one more row and col of pad for downscale interpolation, fixes
+     * warnings from valgrind about using uninitialized pixels */
+    padx++; pady++;
+
 #if HIGH_BIT_DEPTH
     if (pic.bitDepth > 8)
     {
diff -r 5a607dd446ea -r 8e0fa5fcbf15 source/Lib/TLibCommon/TComYuv.cpp
--- a/source/Lib/TLibCommon/TComYuv.cpp	Mon Jan 13 11:01:16 2014 -0600
+++ b/source/Lib/TLibCommon/TComYuv.cpp	Mon Jan 13 16:05:56 2014 -0600
@@ -67,10 +67,10 @@ void TComYuv::create(uint32_t width, uin
     m_hChromaShift = CHROMA_H_SHIFT(csp);
     m_vChromaShift = CHROMA_V_SHIFT(csp);
 
-    // memory allocation
+    // memory allocation (padded for SIMD reads)
     m_bufY = (Pel*)X265_MALLOC(Pel, width * height);
-    m_bufU = (Pel*)X265_MALLOC(Pel, (width >> m_hChromaShift) * (height >> m_vChromaShift));
-    m_bufV = (Pel*)X265_MALLOC(Pel, (width >> m_hChromaShift) * (height >> m_vChromaShift));
+    m_bufU = (Pel*)X265_MALLOC(Pel, (width >> m_hChromaShift) * (height >> m_vChromaShift) + 8);
+    m_bufV = (Pel*)X265_MALLOC(Pel, (width >> m_hChromaShift) * (height >> m_vChromaShift) + 8);
 
     // set width and height
     m_width   = width;
diff -r 5a607dd446ea -r 8e0fa5fcbf15 source/common/wavefront.cpp
--- a/source/common/wavefront.cpp	Mon Jan 13 11:01:16 2014 -0600
+++ b/source/common/wavefront.cpp	Mon Jan 13 16:05:56 2014 -0600
@@ -95,12 +95,12 @@ bool WaveFront::checkHigherPriorityRow(i
     // Check full bitmap words before curRow
     for (int i = 0; i < fullwords; i++)
     {
-        if (m_queuedBitmap[i])
+        if (m_queuedBitmap[i] & m_enableBitmap[i])
             return true;
     }
 
     // check the partially masked bitmap word of curRow
-    if (m_queuedBitmap[fullwords] & mask)
+    if (m_queuedBitmap[fullwords] & m_enableBitmap[i] & mask)
         return true;
     return false;
 }
diff -r 5a607dd446ea -r 8e0fa5fcbf15 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Jan 13 11:01:16 2014 -0600
+++ b/source/common/x86/asm-primitives.cpp	Mon Jan 13 16:05:56 2014 -0600
@@ -549,13 +549,10 @@ extern "C" {
 
 #define SETUP_INTRA_ANG4(mode, fno, cpu) \
     p.intra_pred[BLOCK_4x4][mode] = x265_intra_pred_ang4_ ## fno ## _ ## cpu;
-#define SETUP_INTRA_ANG8(mode, fno, cpu) \
-    p.intra_pred[BLOCK_8x8][mode] = x265_intra_pred_ang8_ ## fno ## _ ## cpu;
-#define SETUP_INTRA_ANG16(mode, fno, cpu) \
-    p.intra_pred[BLOCK_16x16][mode] = x265_intra_pred_ang16_ ## fno ## _ ## cpu;
 
 namespace x265 {
 // private x265 namespace
+
 void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask)
 {
 #if HIGH_BIT_DEPTH
@@ -892,13 +889,10 @@ void Setup_Assembly_Primitives(EncoderPr
 
         SETUP_INTRA_ANG4(2, 2, ssse3);
         SETUP_INTRA_ANG4(34, 2, ssse3);
-        SETUP_INTRA_ANG8(2, 2, ssse3);
-        SETUP_INTRA_ANG8(34, 2, ssse3);
-        SETUP_INTRA_ANG16(2, 2, ssse3);
-        SETUP_INTRA_ANG16(34, 2, ssse3);
 
         p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
         p.scale2D_64to32 = x265_scale2D_64to32_ssse3;
+
         SAD_X3(ssse3);
         SAD_X4(ssse3);
         p.sad_x4[LUMA_8x4] = x265_pixel_sad_x4_8x4_ssse3;
diff -r 5a607dd446ea -r 8e0fa5fcbf15 source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h	Mon Jan 13 11:01:16 2014 -0600
+++ b/source/common/x86/intrapred.h	Mon Jan 13 16:05:56 2014 -0600
@@ -57,45 +57,9 @@ DECL_ANG(4, 16, sse4);
 DECL_ANG(4, 17, sse4);
 DECL_ANG(4, 18, sse4);
 DECL_ANG(4, 26, sse4);
-DECL_ANG(8, 2, ssse3);
-DECL_ANG(8, 3, sse4);
-DECL_ANG(8, 4, sse4);
-DECL_ANG(8, 5, sse4);
-DECL_ANG(8, 6, sse4);
-DECL_ANG(8, 7, sse4);
-DECL_ANG(8, 8, sse4);
-DECL_ANG(8, 9, sse4);
-DECL_ANG(8, 10, sse4);
-DECL_ANG(8, 11, sse4);
-DECL_ANG(8, 12, sse4);
-DECL_ANG(8, 13, sse4);
-DECL_ANG(8, 14, sse4);
-DECL_ANG(8, 15, sse4);
-DECL_ANG(8, 16, sse4);
-DECL_ANG(8, 17, sse4);
-DECL_ANG(8, 18, sse4);
-DECL_ANG(8, 26, sse4);
-
-DECL_ANG(16, 2, ssse3);
-DECL_ANG(16, 3, sse4);
-DECL_ANG(16, 4, sse4);
-DECL_ANG(16, 5, sse4);
-DECL_ANG(16, 6, sse4);
-DECL_ANG(16, 7, sse4);
-DECL_ANG(16, 8, sse4);
-DECL_ANG(16, 9, sse4);
-DECL_ANG(16, 10, sse4);
-DECL_ANG(16, 11, sse4);
-DECL_ANG(16, 12, sse4);
-DECL_ANG(16, 13, sse4);
-DECL_ANG(16, 14, sse4);
-DECL_ANG(16, 15, sse4);
-DECL_ANG(16, 16, sse4);
-DECL_ANG(16, 17, sse4);
-DECL_ANG(16, 18, sse4);
-DECL_ANG(16, 26, sse4);
 
 #undef DECL_ANG
+
 void x265_all_angs_pred_4x4_sse4(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma);
 void x265_all_angs_pred_8x8_sse4(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma);
 
diff -r 5a607dd446ea -r 8e0fa5fcbf15 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm	Mon Jan 13 11:01:16 2014 -0600
+++ b/source/common/x86/intrapred8.asm	Mon Jan 13 16:05:56 2014 -0600
@@ -1105,82 +1105,6 @@ cglobal intra_pred_ang4_18, 4,4,1
     psrldq      m0, 1
     movd        [r0], m0
     RET
-;-----------------------------------------------------------------------------
-; void intraPredAng(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;-----------------------------------------------------------------------------
-INIT_XMM ssse3
-cglobal intra_pred_ang8_2, 3,5,2
-    cmp         r4m,            byte 34
-    cmove       r2,             r3mp
-    movu        m0,             [r2 + 2]
-    lea         r4,             [r1 * 3]
-
-    movh        [r0],           m0
-    palignr     m1,             m0, 1
-    movh        [r0 + r1],      m1
-    palignr     m1,             m0, 2
-    movh        [r0 + r1 * 2],  m1
-    palignr     m1,             m0, 3
-    movh        [r0 + r4],      m1
-    palignr     m1,             m0, 4
-    lea         r0,             [r0 + r1 * 4]
-    movh        [r0],           m1
-    palignr     m1,             m0, 5
-    movh        [r0 + r1],      m1
-    palignr     m1,             m0, 6
-    movh        [r0 + r1 * 2],  m1
-    palignr     m1,             m0, 7
-    movh        [r0 + r4],      m1
-    RET
-
-;-----------------------------------------------------------------------------
-; void intraPredAng16(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;-----------------------------------------------------------------------------
-INIT_XMM ssse3
-cglobal intra_pred_ang16_2, 3,3,3
-    cmp             r4m, byte 34
-    cmove           r2, r3mp
-    movu            m0, [r2 + 2]
-    movu            m1, [r2 + 18]
-    movu            [r0], m0
-    palignr         m2, m1, m0, 1
-    movu            [r0 + r1], m2
-    lea             r0, [r0 + r1 * 2]
-    palignr         m2, m1, m0, 2
-    movu            [r0], m2
-    palignr         m2, m1, m0, 3
-    movu            [r0 + r1], m2
-    lea             r0, [r0 + r1 * 2]
-    palignr         m2, m1, m0, 4
-    movu            [r0], m2
-    palignr         m2, m1, m0, 5
-    movu            [r0 + r1], m2
-    lea             r0, [r0 + r1 * 2]
-    palignr         m2, m1, m0, 6
-    movu            [r0], m2
-    palignr         m2, m1, m0, 7
-    movu            [r0 + r1], m2
-    lea             r0, [r0 + r1 * 2]
-    palignr         m2, m1, m0, 8
-    movu            [r0], m2
-    palignr         m2, m1, m0, 9
-    movu            [r0 + r1], m2
-    lea             r0, [r0 + r1 * 2]
-    palignr         m2, m1, m0, 10
-    movu            [r0], m2
-    palignr         m2, m1, m0, 11
-    movu            [r0 + r1], m2
-    lea             r0, [r0 + r1 * 2]
-    palignr         m2, m1, m0, 12
-    movu            [r0], m2
-    palignr         m2, m1, m0, 13
-    movu            [r0 + r1], m2
-    lea             r0, [r0 + r1 * 2]
-    palignr         m2, m1, m0, 14
-    movu            [r0], m2
-    palignr         m2, m1, m0, 15
-    movu            [r0 + r1], m2
-    RET
 
 ;-----------------------------------------------------------------------------
 ; void all_angs_pred_4x4(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma)
diff -r 5a607dd446ea -r 8e0fa5fcbf15 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Mon Jan 13 11:01:16 2014 -0600
+++ b/source/common/x86/ipfilter8.asm	Mon Jan 13 16:05:56 2014 -0600
@@ -29,7 +29,6 @@
 SECTION_RODATA 32
 tab_Tm:    db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
            db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
-           db 8, 9,10,11, 9,10,11,12,10,11,12,13,11,12,13, 14
 
 tab_Lm:    db 0, 1, 2, 3, 4,  5,  6,  7,  1, 2, 3, 4,  5,  6,  7,  8
            db 2, 3, 4, 5, 6,  7,  8,  9,  3, 4, 5, 6,  7,  8,  9,  10
@@ -128,7 +127,6 @@ tab_c_64_n64:   times 8 db 64, -64
 
 SECTION .text
 
-cextern pw_512
 cextern pw_2000
 
 %macro FILTER_H4_w2_2 3
@@ -690,80 +688,30 @@ cglobal interp_8tap_horiz_%3_%1x%2, 4,7,
 ; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;--------------------------------------------------------------------------------------------------------------
     IPFILTER_LUMA 4, 4, pp
+    IPFILTER_LUMA 8, 8, pp
+    IPFILTER_LUMA 8, 4, pp
     IPFILTER_LUMA 4, 8, pp
+    IPFILTER_LUMA 16, 16, pp
+    IPFILTER_LUMA 16, 8, pp
+    IPFILTER_LUMA 8, 16, pp
+    IPFILTER_LUMA 16, 12, pp
     IPFILTER_LUMA 12, 16, pp
+    IPFILTER_LUMA 16, 4, pp
     IPFILTER_LUMA 4, 16, pp
-
-
-;--------------------------------------------------------------------------------------------------------------
-; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;--------------------------------------------------------------------------------------------------------------
-%macro IPFILTER_LUMA_PP_W8 2
-INIT_XMM sse4
-cglobal interp_8tap_horiz_pp_%1x%2, 4,6,7
-    mov         r4d, r4m
-
-%ifdef PIC
-    lea         r5, [tab_LumaCoeff]
-    movh        m3, [r5 + r4 * 8]
-%else
-    movh        m3, [tab_LumaCoeff + r4 * 8]
-%endif
-    pshufd      m0, m3, 0                       ; m0 = coeff-L
-    pshufd      m1, m3, 0x55                    ; m1 = coeff-H
-    lea         r5, [tab_Tm]                    ; r5 = shuffle
-    mova        m2, [pw_512]                    ; m2 = 512
-
-    mov         r4d, %2
-.loopH
-%assign x 0
-%rep %1 / 8
-    movu        m3, [r0 - 3 + x]                ; m3 = [F E D C B A 9 8 7 6 5 4 3 2 1 0]
-    pshufb      m4, m3, [r5 + 0*16]             ; m4 = [6 5 4 3 5 4 3 2 4 3 2 1 3 2 1 0]
-    pshufb      m5, m3, [r5 + 1*16]             ; m5 = [A 9 8 7 9 8 7 6 8 7 6 5 7 6 5 4]
-    pshufb          m3, [r5 + 2*16]             ; m3 = [E D C B D C B A C B A 9 B A 9 8]
-    pmaddubsw   m4, m0
-    pmaddubsw   m6, m5, m1
-    pmaddubsw   m5, m0
-    pmaddubsw   m3, m1
-    paddw       m4, m6
-    paddw       m5, m3
-    phaddw      m4, m5
-    pmulhrsw    m4, m2
-    packuswb    m4, m4
-    movh        [r2 + x], m4
-%assign x x+8
-%endrep
-
-    add       r0, r1
-    add       r2, r3
-
-    dec       r4d
-    jnz      .loopH
-    RET
-%endmacro


More information about the x265-commits mailing list