[x265-commits] [x265] sao: created new primitive for saoCuStatsE1
Dnyaneshwar G
dnyaneshwar at multicorewareinc.com
Mon Jul 6 17:48:40 CEST 2015
details: http://hg.videolan.org/x265/rev/81e30626b711
branches:
changeset: 10764:81e30626b711
user: Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
date: Wed Jul 01 16:49:24 2015 +0530
description:
sao: created new primitive for saoCuStatsE1
Subject: [x265] sao: created new primitive for saoCuStatsE0
details: http://hg.videolan.org/x265/rev/eca6c4408c90
branches:
changeset: 10765:eca6c4408c90
user: Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
date: Wed Jul 01 16:50:32 2015 +0530
description:
sao: created new primitive for saoCuStatsE0
Subject: [x265] sao: created new primitive for saoCuStatsBO
details: http://hg.videolan.org/x265/rev/54a45f1f7469
branches:
changeset: 10766:54a45f1f7469
user: Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
date: Wed Jul 01 16:51:20 2015 +0530
description:
sao: created new primitive for saoCuStatsBO
Subject: [x265] asm: avx2 code for weight_sp() 16bpp
details: http://hg.videolan.org/x265/rev/2fab331dd158
branches:
changeset: 10767:2fab331dd158
user: Aasaipriya Chandran <aasaipriya at multicorewareinc.com>
date: Mon Jun 29 12:49:55 2015 +0530
description:
asm: avx2 code for weight_sp() 16bpp
avx2: weight_sp 11.37x 4496.63 51139.20
sse4: weight_sp 6.48x 8163.87 52870.36
Subject: [x265] motion: fix overflow in mvcost() check failure.
details: http://hg.videolan.org/x265/rev/bf57ce5d38d5
branches:
changeset: 10768:bf57ce5d38d5
user: Divya Manivannan <divya at multicorewareinc.com>
date: Fri Jul 03 19:50:02 2015 +0530
description:
motion: fix overflow in mvcost() check failure.
The mvcost() function always truncates the value to 16 bits, but there is a possibility
of overflow in the mvcost() check failure condition given.
This check can be done in the cost calculation (s_costs) initially, and so it is
removed here.
Subject: [x265] Merge
details: http://hg.videolan.org/x265/rev/aade1fffa9bd
branches:
changeset: 10769:aade1fffa9bd
user: Steve Borho <steve at borho.org>
date: Mon Jul 06 10:48:21 2015 -0500
description:
Merge
diffstat:
build/msys/multilib.sh | 2 +-
source/common/primitives.h | 6 +
source/common/x86/asm-primitives.cpp | 1 +
source/common/x86/pixel-util8.asm | 126 +++++++++++++++++++++++-
source/encoder/motion.cpp | 40 ++-----
source/encoder/sao.cpp | 185 +++++++++++++++++-----------------
6 files changed, 236 insertions(+), 124 deletions(-)
diffs (truncated from 516 to 300 lines):
diff -r 1162fb0b99f8 -r aade1fffa9bd build/msys/multilib.sh
--- a/build/msys/multilib.sh Fri Jul 03 13:43:47 2015 -0500
+++ b/build/msys/multilib.sh Mon Jul 06 10:48:21 2015 -0500
@@ -13,5 +13,5 @@ make ${MAKEFLAGS}
cp libx265.a ../8bit/libx265_main10.a
cd ../8bit
-cmake -G "MSYS Makefiles" ../../../source -DEXPORT_C_API=OFF -DENABLE_SHARED=OFF -DENABLE_CLI=ON -DEXTRA_LIB="x265_main10.a x265_main12.a" -DEXTRA_LINK_FLAGS=-L.
+cmake -G "MSYS Makefiles" ../../../source -DEXPORT_C_API=OFF -DENABLE_SHARED=OFF -DENABLE_CLI=ON -DEXTRA_LIB="x265_main10.a;x265_main12.a" -DEXTRA_LINK_FLAGS=-L.
make ${MAKEFLAGS}
diff -r 1162fb0b99f8 -r aade1fffa9bd source/common/primitives.h
--- a/source/common/primitives.h Fri Jul 03 13:43:47 2015 -0500
+++ b/source/common/primitives.h Mon Jul 06 10:48:21 2015 -0500
@@ -174,6 +174,9 @@ typedef void (*saoCuOrgE2_t)(pixel* rec,
typedef void (*saoCuOrgE3_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX);
typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
+typedef void (*saoCuStatsBO_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsE0_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsE1_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
typedef void (*saoCuStatsE2_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBuff, int endX, int endY, int32_t *stats, int32_t *count);
typedef void (*saoCuStatsE3_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
@@ -297,6 +300,9 @@ struct EncoderPrimitives
saoCuOrgE3_t saoCuOrgE3[2];
saoCuOrgB0_t saoCuOrgB0;
+ saoCuStatsBO_t saoCuStatsBO;
+ saoCuStatsE0_t saoCuStatsE0;
+ saoCuStatsE1_t saoCuStatsE1;
saoCuStatsE2_t saoCuStatsE2;
saoCuStatsE3_t saoCuStatsE3;
diff -r 1162fb0b99f8 -r aade1fffa9bd source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Jul 03 13:43:47 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Mon Jul 06 10:48:21 2015 -0500
@@ -1524,6 +1524,7 @@ void setupAssemblyPrimitives(EncoderPrim
p.scale1D_128to64 = PFX(scale1D_128to64_avx2);
p.scale2D_64to32 = PFX(scale2D_64to32_avx2);
p.weight_pp = PFX(weight_pp_avx2);
+ p.weight_sp = PFX(weight_sp_avx2);
p.sign = PFX(calSign_avx2);
p.planecopy_cp = PFX(upShift_8_avx2);
diff -r 1162fb0b99f8 -r aade1fffa9bd source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Fri Jul 03 13:43:47 2015 -0500
+++ b/source/common/x86/pixel-util8.asm Mon Jul 06 10:48:21 2015 -0500
@@ -1674,8 +1674,128 @@ cglobal weight_sp, 6, 7, 7, 0-(2*4)
dec r5d
jnz .loopH
RET
-
-%if ARCH_X86_64
+%endif
+
+
+%if HIGH_BIT_DEPTH
+INIT_YMM avx2
+cglobal weight_sp, 6,7,9
+ mova m1, [pw_1023]
+ mova m2, [pw_1]
+ mov r6d, r7m
+ shl r6d, 16
+ or r6d, r6m
+ vpbroadcastd m3, r6d ; m3 = [round w0]
+ movd xm4, r8m ; m4 = [shift]
+ vpbroadcastd m5, r9m ; m5 = [offset]
+
+ ; correct row stride
+ add r3d, r3d
+ add r2d, r2d
+ mov r6d, r4d
+ and r6d, ~(mmsize / SIZEOF_PIXEL - 1)
+ sub r3d, r6d
+ sub r3d, r6d
+ sub r2d, r6d
+ sub r2d, r6d
+
+ ; generate partial width mask (MUST BE IN YMM0)
+ mov r6d, r4d
+ and r6d, (mmsize / SIZEOF_PIXEL - 1)
+ movd xm0, r6d
+ pshuflw m0, m0, 0
+ punpcklqdq m0, m0
+ vinserti128 m0, m0, xm0, 1
+ pcmpgtw m0, [pw_0_15]
+
+.loopH:
+ mov r6d, r4d
+
+.loopW:
+ movu m6, [r0]
+ paddw m6, [pw_2000]
+
+ punpcklwd m7, m6, m2
+ pmaddwd m7, m3 ;(round w0)
+ psrad m7, xm4 ;(shift)
+ paddd m7, m5 ;(offset)
+
+ punpckhwd m6, m2
+ pmaddwd m6, m3
+ psrad m6, xm4
+ paddd m6, m5
+
+ packusdw m7, m6
+ pminuw m7, m1
+
+ sub r6d, (mmsize / SIZEOF_PIXEL)
+ jl .width14
+ movu [r1], m7
+ lea r0, [r0 + mmsize]
+ lea r1, [r1 + mmsize]
+ je .nextH
+ jmp .loopW
+
+.width14:
+ add r6d, 16
+ cmp r6d, 14
+ jl .width12
+ movu [r1], xm7
+ vextracti128 xm8, m7, 1
+ movq [r1 + 16], xm8
+ pextrd [r1 + 24], xm8, 2
+ je .nextH
+
+.width12:
+ cmp r6d, 12
+ jl .width10
+ movu [r1], xm7
+ vextracti128 xm8, m7, 1
+ movq [r1 + 16], xm8
+ je .nextH
+
+.width10:
+ cmp r6d, 10
+ jl .width8
+ movu [r1], xm7
+ vextracti128 xm8, m7, 1
+ movd [r1 + 16], xm8
+ je .nextH
+
+.width8:
+ cmp r6d, 8
+ jl .width6
+ movu [r1], xm7
+ je .nextH
+
+.width6
+ cmp r6d, 6
+ jl .width4
+ movq [r1], xm7
+ pextrd [r1 + 8], xm7, 2
+ je .nextH
+
+.width4:
+ cmp r6d, 4
+ jl .width2
+ movq [r1], xm7
+ je .nextH
+ add r1, 4
+ pshufd m6, m6, 1
+ je .nextH
+
+.width2:
+ movd [r1], xm7
+
+.nextH:
+ add r0, r2
+ add r1, r3
+
+ dec r5d
+ jnz .loopH
+ RET
+
+%else
INIT_YMM avx2
cglobal weight_sp, 6, 9, 7
mov r7d, r7m
@@ -1752,8 +1872,6 @@ cglobal weight_sp, 6, 9, 7
jnz .loopH
RET
%endif
-%endif ; end of (HIGH_BIT_DEPTH == 0)
-
;-----------------------------------------------------------------
; void transpose_4x4(pixel *dst, pixel *src, intptr_t stride)
diff -r 1162fb0b99f8 -r aade1fffa9bd source/encoder/motion.cpp
--- a/source/encoder/motion.cpp Fri Jul 03 13:43:47 2015 -0500
+++ b/source/encoder/motion.cpp Mon Jul 06 10:48:21 2015 -0500
@@ -234,14 +234,9 @@ void MotionEstimate::setSourcePU(const Y
pix_base + (m1x) + (m1y) * stride, \
pix_base + (m2x) + (m2y) * stride, \
stride, costs); \
- const uint16_t *base_mvx = &m_cost_mvx[(bmv.x + (m0x)) << 2]; \
- const uint16_t *base_mvy = &m_cost_mvy[(bmv.y + (m0y)) << 2]; \
- X265_CHECK(mvcost((bmv + MV(m0x, m0y)) << 2) == (base_mvx[((m0x) - (m0x)) << 2] + base_mvy[((m0y) - (m0y)) << 2]), "mvcost() check failure\n"); \
- X265_CHECK(mvcost((bmv + MV(m1x, m1y)) << 2) == (base_mvx[((m1x) - (m0x)) << 2] + base_mvy[((m1y) - (m0y)) << 2]), "mvcost() check failure\n"); \
- X265_CHECK(mvcost((bmv + MV(m2x, m2y)) << 2) == (base_mvx[((m2x) - (m0x)) << 2] + base_mvy[((m2y) - (m0y)) << 2]), "mvcost() check failure\n"); \
- (costs)[0] += (base_mvx[((m0x) - (m0x)) << 2] + base_mvy[((m0y) - (m0y)) << 2]); \
- (costs)[1] += (base_mvx[((m1x) - (m0x)) << 2] + base_mvy[((m1y) - (m0y)) << 2]); \
- (costs)[2] += (base_mvx[((m2x) - (m0x)) << 2] + base_mvy[((m2y) - (m0y)) << 2]); \
+ (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \
+ (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
+ (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
}
#define COST_MV_PT_DIST_X4(m0x, m0y, p0, d0, m1x, m1y, p1, d1, m2x, m2y, p2, d2, m3x, m3y, p3, d3) \
@@ -271,16 +266,10 @@ void MotionEstimate::setSourcePU(const Y
pix_base + (m2x) + (m2y) * stride, \
pix_base + (m3x) + (m3y) * stride, \
stride, costs); \
- const uint16_t *base_mvx = &m_cost_mvx[(omv.x << 2)]; \
- const uint16_t *base_mvy = &m_cost_mvy[(omv.y << 2)]; \
- X265_CHECK(mvcost((omv + MV(m0x, m0y)) << 2) == (base_mvx[(m0x) << 2] + base_mvy[(m0y) << 2]), "mvcost() check failure\n"); \
- X265_CHECK(mvcost((omv + MV(m1x, m1y)) << 2) == (base_mvx[(m1x) << 2] + base_mvy[(m1y) << 2]), "mvcost() check failure\n"); \
- X265_CHECK(mvcost((omv + MV(m2x, m2y)) << 2) == (base_mvx[(m2x) << 2] + base_mvy[(m2y) << 2]), "mvcost() check failure\n"); \
- X265_CHECK(mvcost((omv + MV(m3x, m3y)) << 2) == (base_mvx[(m3x) << 2] + base_mvy[(m3y) << 2]), "mvcost() check failure\n"); \
- costs[0] += (base_mvx[(m0x) << 2] + base_mvy[(m0y) << 2]); \
- costs[1] += (base_mvx[(m1x) << 2] + base_mvy[(m1y) << 2]); \
- costs[2] += (base_mvx[(m2x) << 2] + base_mvy[(m2y) << 2]); \
- costs[3] += (base_mvx[(m3x) << 2] + base_mvy[(m3y) << 2]); \
+ costs[0] += mvcost((omv + MV(m0x, m0y)) << 2); \
+ costs[1] += mvcost((omv + MV(m1x, m1y)) << 2); \
+ costs[2] += mvcost((omv + MV(m2x, m2y)) << 2); \
+ costs[3] += mvcost((omv + MV(m3x, m3y)) << 2); \
COPY2_IF_LT(bcost, costs[0], bmv, omv + MV(m0x, m0y)); \
COPY2_IF_LT(bcost, costs[1], bmv, omv + MV(m1x, m1y)); \
COPY2_IF_LT(bcost, costs[2], bmv, omv + MV(m2x, m2y)); \
@@ -296,17 +285,10 @@ void MotionEstimate::setSourcePU(const Y
pix_base + (m2x) + (m2y) * stride, \
pix_base + (m3x) + (m3y) * stride, \
stride, costs); \
- /* TODO: use restrict keyword in ICL */ \
- const uint16_t *base_mvx = &m_cost_mvx[(bmv.x << 2)]; \
- const uint16_t *base_mvy = &m_cost_mvy[(bmv.y << 2)]; \
- X265_CHECK(mvcost((bmv + MV(m0x, m0y)) << 2) == (base_mvx[(m0x) << 2] + base_mvy[(m0y) << 2]), "mvcost() check failure\n"); \
- X265_CHECK(mvcost((bmv + MV(m1x, m1y)) << 2) == (base_mvx[(m1x) << 2] + base_mvy[(m1y) << 2]), "mvcost() check failure\n"); \
- X265_CHECK(mvcost((bmv + MV(m2x, m2y)) << 2) == (base_mvx[(m2x) << 2] + base_mvy[(m2y) << 2]), "mvcost() check failure\n"); \
- X265_CHECK(mvcost((bmv + MV(m3x, m3y)) << 2) == (base_mvx[(m3x) << 2] + base_mvy[(m3y) << 2]), "mvcost() check failure\n"); \
- (costs)[0] += (base_mvx[(m0x) << 2] + base_mvy[(m0y) << 2]); \
- (costs)[1] += (base_mvx[(m1x) << 2] + base_mvy[(m1y) << 2]); \
- (costs)[2] += (base_mvx[(m2x) << 2] + base_mvy[(m2y) << 2]); \
- (costs)[3] += (base_mvx[(m3x) << 2] + base_mvy[(m3y) << 2]); \
+ (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \
+ (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
+ (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
+ (costs)[3] += mvcost((bmv + MV(m3x, m3y)) << 2); \
}
#define DIA1_ITER(mx, my) \
diff -r 1162fb0b99f8 -r aade1fffa9bd source/encoder/sao.cpp
--- a/source/encoder/sao.cpp Fri Jul 03 13:43:47 2015 -0500
+++ b/source/encoder/sao.cpp Mon Jul 06 10:48:21 2015 -0500
@@ -671,7 +671,6 @@ void SAO::copySaoUnit(SaoCtuParam* saoUn
/* Calculate SAO statistics for current CTU without non-crossing slice */
void SAO::calcSaoStatsCu(int addr, int plane)
{
- int x, y;
const CUData* cu = m_frame->m_encData->getPicCTU(addr);
const pixel* fenc0 = m_frame->m_fencPic->getPlaneAddr(plane, addr);
const pixel* rec0 = m_frame->m_reconPic->getPlaneAddr(plane, addr);
@@ -702,8 +701,6 @@ void SAO::calcSaoStatsCu(int addr, int p
int startY;
int endX;
int endY;
- int32_t* stats;
- int32_t* count;
int skipB = plane ? 2 : 4;
int skipR = plane ? 3 : 5;
@@ -711,41 +708,18 @@ void SAO::calcSaoStatsCu(int addr, int p
int8_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1;
int8_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1;
- // Dynamic Range: 64x64x14bpp = 24bits
- int32_t tmp_stats[NUM_EDGETYPE];
- // TODO: improve by uint64_t, but need Haswell SHLX
- uint16_t tmp_count[NUM_EDGETYPE];
-
// SAO_BO:
{
- const int boShift = X265_DEPTH - SAO_BO_BITS;
-
if (m_param->bSaoNonDeblocked)
{
skipB = plane ? 1 : 3;
skipR = plane ? 2 : 4;
}
- stats = m_offsetOrg[plane][SAO_BO];
- count = m_count[plane][SAO_BO];
-
- fenc = fenc0;
- rec = rec0;
endX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR;
endY = (bpely == picHeight) ? ctuHeight : ctuHeight - skipB;
- for (y = 0; y < endY; y++)
- {
More information about the x265-commits
mailing list