[x265-commits] [x265] cmake: tweak order of compiles to improve parallel build ...
Steve Borho
steve at borho.org
Tue Jan 14 06:01:53 CET 2014
details: http://hg.videolan.org/x265/rev/3bc604fdd380
branches:
changeset: 5821:3bc604fdd380
user: Steve Borho <steve at borho.org>
date: Mon Jan 13 11:50:22 2014 -0600
description:
cmake: tweak order of compiles to improve parallel build times
Move assembly and intrinsic files to front of the build
Subject: [x265] TComPicYuv: add a row and col of padding for lowres interpolation
details: http://hg.videolan.org/x265/rev/8d3cdf1a846e
branches: stable
changeset: 5822:8d3cdf1a846e
user: Steve Borho <steve at borho.org>
date: Mon Jan 13 15:58:06 2014 -0600
description:
TComPicYuv: add a row and col of padding for lowres interpolation
Subject: [x265] TComYuv: pad chroma allocations, fix valgrind warnings
details: http://hg.videolan.org/x265/rev/aae31685d8c7
branches: stable
changeset: 5823:aae31685d8c7
user: Steve Borho <steve at borho.org>
date: Mon Jan 13 19:46:00 2014 -0600
description:
TComYuv: pad chroma allocations, fix valgrind warnings
Subject: [x265] wavefront: consider enabled bitmap status in checkHigherPriorityRow()
details: http://hg.videolan.org/x265/rev/8e0fa5fcbf15
branches: stable
changeset: 5824:8e0fa5fcbf15
user: Steve Borho <steve at borho.org>
date: Mon Jan 13 16:05:56 2014 -0600
description:
wavefront: consider enabled bitmap status in checkHigherPriorityRow()
diffstat:
source/Lib/TLibCommon/TComPicYuv.cpp | 4 +
source/Lib/TLibCommon/TComYuv.cpp | 6 +-
source/common/wavefront.cpp | 4 +-
source/common/x86/asm-primitives.cpp | 10 +---
source/common/x86/intrapred.h | 38 +--------------
source/common/x86/intrapred8.asm | 76 -----------------------------
source/common/x86/ipfilter8.asm | 94 ++++++++---------------------------
7 files changed, 33 insertions(+), 199 deletions(-)
diffs (truncated from 339 to 300 lines):
diff -r 5a607dd446ea -r 8e0fa5fcbf15 source/Lib/TLibCommon/TComPicYuv.cpp
--- a/source/Lib/TLibCommon/TComPicYuv.cpp Mon Jan 13 11:01:16 2014 -0600
+++ b/source/Lib/TLibCommon/TComPicYuv.cpp Mon Jan 13 16:05:56 2014 -0600
@@ -214,6 +214,10 @@ void TComPicYuv::copyFromPicture(const x
rem = height & 15;
pady = rem ? 16 - rem : pady;
+ /* add one more row and col of pad for downscale interpolation, fixes
+ * warnings from valgrind about using uninitialized pixels */
+ padx++; pady++;
+
#if HIGH_BIT_DEPTH
if (pic.bitDepth > 8)
{
diff -r 5a607dd446ea -r 8e0fa5fcbf15 source/Lib/TLibCommon/TComYuv.cpp
--- a/source/Lib/TLibCommon/TComYuv.cpp Mon Jan 13 11:01:16 2014 -0600
+++ b/source/Lib/TLibCommon/TComYuv.cpp Mon Jan 13 16:05:56 2014 -0600
@@ -67,10 +67,10 @@ void TComYuv::create(uint32_t width, uin
m_hChromaShift = CHROMA_H_SHIFT(csp);
m_vChromaShift = CHROMA_V_SHIFT(csp);
- // memory allocation
+ // memory allocation (padded for SIMD reads)
m_bufY = (Pel*)X265_MALLOC(Pel, width * height);
- m_bufU = (Pel*)X265_MALLOC(Pel, (width >> m_hChromaShift) * (height >> m_vChromaShift));
- m_bufV = (Pel*)X265_MALLOC(Pel, (width >> m_hChromaShift) * (height >> m_vChromaShift));
+ m_bufU = (Pel*)X265_MALLOC(Pel, (width >> m_hChromaShift) * (height >> m_vChromaShift) + 8);
+ m_bufV = (Pel*)X265_MALLOC(Pel, (width >> m_hChromaShift) * (height >> m_vChromaShift) + 8);
// set width and height
m_width = width;
diff -r 5a607dd446ea -r 8e0fa5fcbf15 source/common/wavefront.cpp
--- a/source/common/wavefront.cpp Mon Jan 13 11:01:16 2014 -0600
+++ b/source/common/wavefront.cpp Mon Jan 13 16:05:56 2014 -0600
@@ -95,12 +95,12 @@ bool WaveFront::checkHigherPriorityRow(i
// Check full bitmap words before curRow
for (int i = 0; i < fullwords; i++)
{
- if (m_queuedBitmap[i])
+ if (m_queuedBitmap[i] & m_enableBitmap[i])
return true;
}
// check the partially masked bitmap word of curRow
- if (m_queuedBitmap[fullwords] & mask)
+ if (m_queuedBitmap[fullwords] & m_enableBitmap[fullwords] & mask)
return true;
return false;
}
diff -r 5a607dd446ea -r 8e0fa5fcbf15 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Jan 13 11:01:16 2014 -0600
+++ b/source/common/x86/asm-primitives.cpp Mon Jan 13 16:05:56 2014 -0600
@@ -549,13 +549,10 @@ extern "C" {
#define SETUP_INTRA_ANG4(mode, fno, cpu) \
p.intra_pred[BLOCK_4x4][mode] = x265_intra_pred_ang4_ ## fno ## _ ## cpu;
-#define SETUP_INTRA_ANG8(mode, fno, cpu) \
- p.intra_pred[BLOCK_8x8][mode] = x265_intra_pred_ang8_ ## fno ## _ ## cpu;
-#define SETUP_INTRA_ANG16(mode, fno, cpu) \
- p.intra_pred[BLOCK_16x16][mode] = x265_intra_pred_ang16_ ## fno ## _ ## cpu;
namespace x265 {
// private x265 namespace
+
void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask)
{
#if HIGH_BIT_DEPTH
@@ -892,13 +889,10 @@ void Setup_Assembly_Primitives(EncoderPr
SETUP_INTRA_ANG4(2, 2, ssse3);
SETUP_INTRA_ANG4(34, 2, ssse3);
- SETUP_INTRA_ANG8(2, 2, ssse3);
- SETUP_INTRA_ANG8(34, 2, ssse3);
- SETUP_INTRA_ANG16(2, 2, ssse3);
- SETUP_INTRA_ANG16(34, 2, ssse3);
p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
p.scale2D_64to32 = x265_scale2D_64to32_ssse3;
+
SAD_X3(ssse3);
SAD_X4(ssse3);
p.sad_x4[LUMA_8x4] = x265_pixel_sad_x4_8x4_ssse3;
diff -r 5a607dd446ea -r 8e0fa5fcbf15 source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h Mon Jan 13 11:01:16 2014 -0600
+++ b/source/common/x86/intrapred.h Mon Jan 13 16:05:56 2014 -0600
@@ -57,45 +57,9 @@ DECL_ANG(4, 16, sse4);
DECL_ANG(4, 17, sse4);
DECL_ANG(4, 18, sse4);
DECL_ANG(4, 26, sse4);
-DECL_ANG(8, 2, ssse3);
-DECL_ANG(8, 3, sse4);
-DECL_ANG(8, 4, sse4);
-DECL_ANG(8, 5, sse4);
-DECL_ANG(8, 6, sse4);
-DECL_ANG(8, 7, sse4);
-DECL_ANG(8, 8, sse4);
-DECL_ANG(8, 9, sse4);
-DECL_ANG(8, 10, sse4);
-DECL_ANG(8, 11, sse4);
-DECL_ANG(8, 12, sse4);
-DECL_ANG(8, 13, sse4);
-DECL_ANG(8, 14, sse4);
-DECL_ANG(8, 15, sse4);
-DECL_ANG(8, 16, sse4);
-DECL_ANG(8, 17, sse4);
-DECL_ANG(8, 18, sse4);
-DECL_ANG(8, 26, sse4);
-
-DECL_ANG(16, 2, ssse3);
-DECL_ANG(16, 3, sse4);
-DECL_ANG(16, 4, sse4);
-DECL_ANG(16, 5, sse4);
-DECL_ANG(16, 6, sse4);
-DECL_ANG(16, 7, sse4);
-DECL_ANG(16, 8, sse4);
-DECL_ANG(16, 9, sse4);
-DECL_ANG(16, 10, sse4);
-DECL_ANG(16, 11, sse4);
-DECL_ANG(16, 12, sse4);
-DECL_ANG(16, 13, sse4);
-DECL_ANG(16, 14, sse4);
-DECL_ANG(16, 15, sse4);
-DECL_ANG(16, 16, sse4);
-DECL_ANG(16, 17, sse4);
-DECL_ANG(16, 18, sse4);
-DECL_ANG(16, 26, sse4);
#undef DECL_ANG
+
void x265_all_angs_pred_4x4_sse4(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma);
void x265_all_angs_pred_8x8_sse4(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma);
diff -r 5a607dd446ea -r 8e0fa5fcbf15 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm Mon Jan 13 11:01:16 2014 -0600
+++ b/source/common/x86/intrapred8.asm Mon Jan 13 16:05:56 2014 -0600
@@ -1105,82 +1105,6 @@ cglobal intra_pred_ang4_18, 4,4,1
psrldq m0, 1
movd [r0], m0
RET
-;-----------------------------------------------------------------------------
-; void intraPredAng(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;-----------------------------------------------------------------------------
-INIT_XMM ssse3
-cglobal intra_pred_ang8_2, 3,5,2
- cmp r4m, byte 34
- cmove r2, r3mp
- movu m0, [r2 + 2]
- lea r4, [r1 * 3]
-
- movh [r0], m0
- palignr m1, m0, 1
- movh [r0 + r1], m1
- palignr m1, m0, 2
- movh [r0 + r1 * 2], m1
- palignr m1, m0, 3
- movh [r0 + r4], m1
- palignr m1, m0, 4
- lea r0, [r0 + r1 * 4]
- movh [r0], m1
- palignr m1, m0, 5
- movh [r0 + r1], m1
- palignr m1, m0, 6
- movh [r0 + r1 * 2], m1
- palignr m1, m0, 7
- movh [r0 + r4], m1
- RET
-
-;-----------------------------------------------------------------------------
-; void intraPredAng16(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;-----------------------------------------------------------------------------
-INIT_XMM ssse3
-cglobal intra_pred_ang16_2, 3,3,3
- cmp r4m, byte 34
- cmove r2, r3mp
- movu m0, [r2 + 2]
- movu m1, [r2 + 18]
- movu [r0], m0
- palignr m2, m1, m0, 1
- movu [r0 + r1], m2
- lea r0, [r0 + r1 * 2]
- palignr m2, m1, m0, 2
- movu [r0], m2
- palignr m2, m1, m0, 3
- movu [r0 + r1], m2
- lea r0, [r0 + r1 * 2]
- palignr m2, m1, m0, 4
- movu [r0], m2
- palignr m2, m1, m0, 5
- movu [r0 + r1], m2
- lea r0, [r0 + r1 * 2]
- palignr m2, m1, m0, 6
- movu [r0], m2
- palignr m2, m1, m0, 7
- movu [r0 + r1], m2
- lea r0, [r0 + r1 * 2]
- palignr m2, m1, m0, 8
- movu [r0], m2
- palignr m2, m1, m0, 9
- movu [r0 + r1], m2
- lea r0, [r0 + r1 * 2]
- palignr m2, m1, m0, 10
- movu [r0], m2
- palignr m2, m1, m0, 11
- movu [r0 + r1], m2
- lea r0, [r0 + r1 * 2]
- palignr m2, m1, m0, 12
- movu [r0], m2
- palignr m2, m1, m0, 13
- movu [r0 + r1], m2
- lea r0, [r0 + r1 * 2]
- palignr m2, m1, m0, 14
- movu [r0], m2
- palignr m2, m1, m0, 15
- movu [r0 + r1], m2
- RET
;-----------------------------------------------------------------------------
; void all_angs_pred_4x4(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma)
diff -r 5a607dd446ea -r 8e0fa5fcbf15 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Mon Jan 13 11:01:16 2014 -0600
+++ b/source/common/x86/ipfilter8.asm Mon Jan 13 16:05:56 2014 -0600
@@ -29,7 +29,6 @@
SECTION_RODATA 32
tab_Tm: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
- db 8, 9,10,11, 9,10,11,12,10,11,12,13,11,12,13, 14
tab_Lm: db 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8
db 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10
@@ -128,7 +127,6 @@ tab_c_64_n64: times 8 db 64, -64
SECTION .text
-cextern pw_512
cextern pw_2000
%macro FILTER_H4_w2_2 3
@@ -690,80 +688,30 @@ cglobal interp_8tap_horiz_%3_%1x%2, 4,7,
; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;--------------------------------------------------------------------------------------------------------------
IPFILTER_LUMA 4, 4, pp
+ IPFILTER_LUMA 8, 8, pp
+ IPFILTER_LUMA 8, 4, pp
IPFILTER_LUMA 4, 8, pp
+ IPFILTER_LUMA 16, 16, pp
+ IPFILTER_LUMA 16, 8, pp
+ IPFILTER_LUMA 8, 16, pp
+ IPFILTER_LUMA 16, 12, pp
IPFILTER_LUMA 12, 16, pp
+ IPFILTER_LUMA 16, 4, pp
IPFILTER_LUMA 4, 16, pp
-
-
-;--------------------------------------------------------------------------------------------------------------
-; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;--------------------------------------------------------------------------------------------------------------
-%macro IPFILTER_LUMA_PP_W8 2
-INIT_XMM sse4
-cglobal interp_8tap_horiz_pp_%1x%2, 4,6,7
- mov r4d, r4m
-
-%ifdef PIC
- lea r5, [tab_LumaCoeff]
- movh m3, [r5 + r4 * 8]
-%else
- movh m3, [tab_LumaCoeff + r4 * 8]
-%endif
- pshufd m0, m3, 0 ; m0 = coeff-L
- pshufd m1, m3, 0x55 ; m1 = coeff-H
- lea r5, [tab_Tm] ; r5 = shuffle
- mova m2, [pw_512] ; m2 = 512
-
- mov r4d, %2
-.loopH
-%assign x 0
-%rep %1 / 8
- movu m3, [r0 - 3 + x] ; m3 = [F E D C B A 9 8 7 6 5 4 3 2 1 0]
- pshufb m4, m3, [r5 + 0*16] ; m4 = [6 5 4 3 5 4 3 2 4 3 2 1 3 2 1 0]
- pshufb m5, m3, [r5 + 1*16] ; m5 = [A 9 8 7 9 8 7 6 8 7 6 5 7 6 5 4]
- pshufb m3, [r5 + 2*16] ; m3 = [E D C B D C B A C B A 9 B A 9 8]
- pmaddubsw m4, m0
- pmaddubsw m6, m5, m1
- pmaddubsw m5, m0
- pmaddubsw m3, m1
- paddw m4, m6
- paddw m5, m3
- phaddw m4, m5
- pmulhrsw m4, m2
- packuswb m4, m4
- movh [r2 + x], m4
-%assign x x+8
-%endrep
-
- add r0, r1
- add r2, r3
-
- dec r4d
- jnz .loopH
- RET
-%endmacro
More information about the x265-commits
mailing list