[x265-commits] [x265] fix build fault on WinXP
Min Chen
chenm003 at 163.com
Fri Apr 24 21:22:23 CEST 2015
details: http://hg.videolan.org/x265/rev/717dd61e2799
branches:
changeset: 10273:717dd61e2799
user: Min Chen <chenm003 at 163.com>
date: Fri Apr 24 21:45:55 2015 +0800
description:
fix build fault on WinXP
Subject: [x265] rdoQuant: split coeff cost into psy and non-psy path
details: http://hg.videolan.org/x265/rev/56ac27335f04
branches:
changeset: 10274:56ac27335f04
user: Min Chen <chenm003 at 163.com>
date: Fri Apr 24 21:45:58 2015 +0800
description:
rdoQuant: split coeff cost into psy and non-psy path
Subject: [x265] rdoQuant: optimize getSigCtxInc()
details: http://hg.videolan.org/x265/rev/3d02057c6e65
branches:
changeset: 10275:3d02057c6e65
user: Min Chen <chenm003 at 163.com>
date: Fri Apr 24 21:46:01 2015 +0800
description:
rdoQuant: optimize getSigCtxInc()
Subject: [x265] asm: avx2 code for chroma vsp filter for i422
details: http://hg.videolan.org/x265/rev/07af054786bf
branches:
changeset: 10276:07af054786bf
user: Sumalatha Polureddy
date: Fri Apr 24 12:07:52 2015 +0530
description:
asm: avx2 code for chroma vsp filter for i422
Subject: [x265] asm: avx2 code for chroma vsp filter for i444
details: http://hg.videolan.org/x265/rev/27081950e06e
branches:
changeset: 10277:27081950e06e
user: Sumalatha Polureddy
date: Fri Apr 24 13:49:59 2015 +0530
description:
asm: avx2 code for chroma vsp filter for i444
Subject: [x265] asm: avx2 code for chroma vps filter for i422
details: http://hg.videolan.org/x265/rev/b665d3a0ef4b
branches:
changeset: 10278:b665d3a0ef4b
user: Sumalatha Polureddy
date: Fri Apr 24 15:32:14 2015 +0530
description:
asm: avx2 code for chroma vps filter for i422
Subject: [x265] asm: avx2 code for chroma vps filter for i444
details: http://hg.videolan.org/x265/rev/0d7402f9ca6b
branches:
changeset: 10279:0d7402f9ca6b
user: Sumalatha Polureddy
date: Fri Apr 24 16:44:32 2015 +0530
description:
asm: avx2 code for chroma vps filter for i444
Subject: [x265] asm: avx2 code for sign primitive: improve 204c->114c
details: http://hg.videolan.org/x265/rev/0380e8bb5e95
branches:
changeset: 10280:0380e8bb5e95
user: Divya Manivannan <divya at multicorewareinc.com>
date: Fri Apr 24 13:47:58 2015 +0530
description:
asm: avx2 code for sign primitive: improve 204c->114c
Subject: [x265] asm: add pixel restoration part in saoCuOrgE2 primitive
details: http://hg.videolan.org/x265/rev/b7c98159982a
branches:
changeset: 10281:b7c98159982a
user: Divya Manivannan <divya at multicorewareinc.com>
date: Fri Apr 24 14:10:55 2015 +0530
description:
asm: add pixel restoration part in saoCuOrgE2 primitive
Subject: [x265] asm: avx2 10bit code for add_ps[16x16],[32x32],[64x64]
details: http://hg.videolan.org/x265/rev/9356573c022a
branches:
changeset: 10282:9356573c022a
user: Rajesh Paulraj<rajesh at multicorewareinc.com>
date: Fri Apr 24 19:20:36 2015 +0530
description:
asm: avx2 10bit code for add_ps[16x16],[32x32],[64x64]
add_ps[16x16](19.29x), add_ps[32x32](22.42x), add_ps[64x64](26.69x)
Subject: [x265] asm: avx2 10bit code for add_ps for chroma sizes 16xN, 32xN, reuse luma code
details: http://hg.videolan.org/x265/rev/55eedcef3708
branches:
changeset: 10283:55eedcef3708
user: Rajesh Paulraj<rajesh at multicorewareinc.com>
date: Fri Apr 24 19:22:48 2015 +0530
description:
asm: avx2 10bit code for add_ps for chroma sizes 16xN, 32xN, reuse luma code
diffstat:
source/CMakeLists.txt | 2 +-
source/common/quant.cpp | 114 +++++++++++--
source/common/x86/asm-primitives.cpp | 79 ++++++++++
source/common/x86/loopfilter.asm | 146 +++++++++++++-----
source/common/x86/loopfilter.h | 1 +
source/common/x86/pixeladd8.asm | 270 +++++++++++++++++++++++++++++++++-
source/encoder/sao.cpp | 17 +--
source/test/pixelharness.cpp | 2 +-
8 files changed, 537 insertions(+), 94 deletions(-)
diffs (truncated from 865 to 300 lines):
diff -r a35fafa25df2 -r 55eedcef3708 source/CMakeLists.txt
--- a/source/CMakeLists.txt Thu Apr 23 12:32:49 2015 -0500
+++ b/source/CMakeLists.txt Fri Apr 24 19:22:48 2015 +0530
@@ -305,7 +305,7 @@ if (WIN32)
if(WINXP_SUPPORT)
# force use of workarounds for CONDITION_VARIABLE and atomic
# intrinsics introduced after XP
- add_definitions(-D_WIN32_WINNT=_WIN32_WINNT_WINXP)
+ add_definitions(-D_WIN32_WINNT=_WIN32_WINNT_WINXP -D_WIN32_WINNT_WIN7=0x0601)
else(WINXP_SUPPORT)
# default to targeting Windows 7 for the NUMA APIs
add_definitions(-D_WIN32_WINNT=_WIN32_WINNT_WIN7)
diff -r a35fafa25df2 -r 55eedcef3708 source/common/quant.cpp
--- a/source/common/quant.cpp Thu Apr 23 12:32:49 2015 -0500
+++ b/source/common/quant.cpp Fri Apr 24 19:22:48 2015 +0530
@@ -659,6 +659,45 @@ uint32_t Quant::rdoQuant(const CUData& c
}
}
+ static const uint8_t table_cnt[5][SCAN_SET_SIZE] =
+ {
+ // patternSigCtx = 0
+ {
+ 2, 1, 1, 0,
+ 1, 1, 0, 0,
+ 1, 0, 0, 0,
+ 0, 0, 0, 0,
+ },
+ // patternSigCtx = 1
+ {
+ 2, 2, 2, 2,
+ 1, 1, 1, 1,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ },
+ // patternSigCtx = 2
+ {
+ 2, 1, 0, 0,
+ 2, 1, 0, 0,
+ 2, 1, 0, 0,
+ 2, 1, 0, 0,
+ },
+ // patternSigCtx = 3
+ {
+ 2, 2, 2, 2,
+ 2, 2, 2, 2,
+ 2, 2, 2, 2,
+ 2, 2, 2, 2,
+ },
+ // 4x4
+ {
+ 0, 1, 4, 5,
+ 2, 3, 4, 5,
+ 6, 6, 8, 8,
+ 7, 7, 8, 8
+ }
+ };
+
/* iterate over coding groups in reverse scan order */
for (int cgScanPos = cgLastScanPos; cgScanPos >= 0; cgScanPos--)
{
@@ -668,6 +707,7 @@ uint32_t Quant::rdoQuant(const CUData& c
const uint32_t cgPosX = cgBlkPos - (cgPosY << codeParams.log2TrSizeCG);
const uint64_t cgBlkPosMask = ((uint64_t)1 << cgBlkPos);
const int patternSigCtx = calcPatternSigCtx(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);
+ const int ctxSigOffset = codeParams.firstSignificanceMapContext + (cgScanPos && bIsLuma ? 3 : 0);
if (c1 == 0)
ctxSet++;
@@ -676,32 +716,63 @@ uint32_t Quant::rdoQuant(const CUData& c
if (cgScanPos && (coeffNum[cgScanPos] == 0))
{
// TODO: does we need zero-coeff cost?
- for (int scanPosinCG = 0; scanPosinCG < SCAN_SET_SIZE; scanPosinCG++)
+ const uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
+ uint32_t blkPos = codeParams.scan[scanPosBase];
+
+ if (usePsyMask)
{
- scanPos = (cgScanPos << MLS_CG_SIZE) + scanPosinCG;
- uint32_t blkPos = codeParams.scan[scanPos];
+ // TODO: this loop can't be SIMD-optimized because PSYVALUE needs a 64-bit multiplication; converting to double and using FMA may be faster
+ for (int y = 0; y < MLS_CG_SIZE; y++)
+ {
+ for (int x = 0; x < MLS_CG_SIZE; x++)
+ {
+ int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
+ int predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
- // TODO: get 16 of ctxSig
- const uint32_t ctxSig = getSigCtxInc(patternSigCtx, log2TrSize, trSize, blkPos, bIsLuma, codeParams.firstSignificanceMapContext);
+ costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;
- /* set default costs to uncoded costs */
- int signCoef = m_resiDctCoeff[blkPos]; /* pre-quantization DCT coeff */
- int predictedCoef = m_fencDctCoeff[blkPos] - signCoef; /* predicted DCT = source DCT - residual DCT*/
+ /* when no residual coefficient is coded, predicted coef == recon coef */
+ costUncoded[blkPos + x] -= PSYVALUE(predictedCoef);
- /* cost of not coding this coefficient (all distortion, no signal bits) */
- costUncoded[blkPos] = ((int64_t)signCoef * signCoef) << scaleBits;
+ totalUncodedCost += costUncoded[blkPos + x];
+ totalRdCost += costUncoded[blkPos + x];
- X265_CHECK(scanPos > 0, "scanPos failure\n");
- if (usePsyMask)
- /* when no residual coefficient is coded, predicted coef == recon coef */
- costUncoded[blkPos] -= PSYVALUE(predictedCoef);
+ const uint32_t scanPosOffset = y * MLS_CG_SIZE + x;
+ const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset;
+ X265_CHECK(trSize > 4, "trSize check failure\n");
+ X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n");
- totalUncodedCost += costUncoded[blkPos];
+ costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[ctxSig][0]);
+ costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x];
+ sigRateDelta[blkPos + x] = estBitsSbac.significantBits[ctxSig][1] - estBitsSbac.significantBits[ctxSig][0];
+ }
+ blkPos += trSize;
+ }
+ }
+ else
+ {
+ // non-psy path
+ for (int y = 0; y < MLS_CG_SIZE; y++)
+ {
+ for (int x = 0; x < MLS_CG_SIZE; x++)
+ {
+ int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
+ costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;
- costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[ctxSig][0]);
- costCoeff[scanPos] = costUncoded[blkPos];
- sigRateDelta[blkPos] = estBitsSbac.significantBits[ctxSig][1] - estBitsSbac.significantBits[ctxSig][0];
- totalRdCost += costCoeff[scanPos];
+ totalUncodedCost += costUncoded[blkPos + x];
+ totalRdCost += costUncoded[blkPos + x];
+
+ const uint32_t scanPosOffset = y * MLS_CG_SIZE + x;
+ const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset;
+ X265_CHECK(trSize > 4, "trSize check failure\n");
+ X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n");
+
+ costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[ctxSig][0]);
+ costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x];
+ sigRateDelta[blkPos + x] = estBitsSbac.significantBits[ctxSig][1] - estBitsSbac.significantBits[ctxSig][0];
+ }
+ blkPos += trSize;
+ }
}
/* there were no coded coefficients in this coefficient group */
@@ -745,6 +816,8 @@ uint32_t Quant::rdoQuant(const CUData& c
// coefficient level estimation
const int* greaterOneBits = estBitsSbac.greaterOneBits[4 * ctxSet + c1];
+ const uint32_t ctxSig = (blkPos == 0) ? 0 : table_cnt[(trSize == 4) ? 4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset;
+ X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, blkPos, bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n");
// before find lastest non-zero coeff
if (scanPos > (uint32_t)lastScanPos)
@@ -761,8 +834,6 @@ uint32_t Quant::rdoQuant(const CUData& c
else if (!(subFlagMask & 1))
{
// fast zero coeff path
- const uint32_t ctxSig = getSigCtxInc(patternSigCtx, log2TrSize, trSize, blkPos, bIsLuma, codeParams.firstSignificanceMapContext);
-
/* set default costs to uncoded costs */
costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[ctxSig][0]);
costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos];
@@ -794,7 +865,6 @@ uint32_t Quant::rdoQuant(const CUData& c
sigRateDelta[blkPos] = 0;
else
{
- const uint32_t ctxSig = getSigCtxInc(patternSigCtx, log2TrSize, trSize, blkPos, bIsLuma, codeParams.firstSignificanceMapContext);
if (maxAbsLevel < 3)
{
/* set default costs to uncoded costs */
diff -r a35fafa25df2 -r 55eedcef3708 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Apr 23 12:32:49 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Fri Apr 24 19:22:48 2015 +0530
@@ -1223,6 +1223,14 @@ void setupAssemblyPrimitives(EncoderPrim
ALL_LUMA_PU(luma_vsp, interp_8tap_vert_sp, avx2);
ALL_LUMA_PU(luma_vss, interp_8tap_vert_ss, avx2);
+ p.cu[BLOCK_16x16].add_ps = x265_pixel_add_ps_16x16_avx2;
+ p.cu[BLOCK_32x32].add_ps = x265_pixel_add_ps_32x32_avx2;
+ p.cu[BLOCK_64x64].add_ps = x265_pixel_add_ps_64x64_avx2;
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps = x265_pixel_add_ps_16x16_avx2;
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = x265_pixel_add_ps_32x32_avx2;
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps = x265_pixel_add_ps_16x32_avx2;
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = x265_pixel_add_ps_32x64_avx2;
+
p.cu[BLOCK_16x16].sub_ps = x265_pixel_sub_ps_16x16_avx2;
p.cu[BLOCK_32x32].sub_ps = x265_pixel_sub_ps_32x32_avx2;
p.cu[BLOCK_64x64].sub_ps = x265_pixel_sub_ps_64x64_avx2;
@@ -1720,6 +1728,7 @@ void setupAssemblyPrimitives(EncoderPrim
p.saoCuOrgE3[0] = x265_saoCuOrgE3_avx2;
p.saoCuOrgE3[1] = x265_saoCuOrgE3_32_avx2;
p.saoCuOrgB0 = x265_saoCuOrgB0_avx2;
+ p.sign = x265_calSign_avx2;
p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_avx2;
p.cu[BLOCK_8x8].psy_cost_ss = x265_psyCost_ss_8x8_avx2;
@@ -2434,6 +2443,76 @@ void setupAssemblyPrimitives(EncoderPrim
p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hps = x265_interp_4tap_horiz_ps_32x24_avx2;
p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hps = x265_interp_4tap_horiz_ps_32x8_avx2;
+ //i422 for chroma_vsp
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].filter_vsp = x265_interp_4tap_vert_sp_4x8_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vsp = x265_interp_4tap_vert_sp_8x16_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vsp = x265_interp_4tap_vert_sp_16x32_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_vsp = x265_interp_4tap_vert_sp_4x4_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].filter_vsp = x265_interp_4tap_vert_sp_2x8_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_vsp = x265_interp_4tap_vert_sp_8x8_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].filter_vsp = x265_interp_4tap_vert_sp_4x16_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vsp = x265_interp_4tap_vert_sp_16x16_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vsp = x265_interp_4tap_vert_sp_8x32_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vsp = x265_interp_4tap_vert_sp_32x32_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_vsp = x265_interp_4tap_vert_sp_8x4_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vsp = x265_interp_4tap_vert_sp_16x8_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vsp = x265_interp_4tap_vert_sp_32x16_avx2;
+
+ //i444 for chroma_vsp
+ p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_vsp = x265_interp_4tap_vert_sp_4x4_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vsp = x265_interp_4tap_vert_sp_8x8_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vsp = x265_interp_4tap_vert_sp_16x16_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vsp = x265_interp_4tap_vert_sp_32x32_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_vsp = x265_interp_4tap_vert_sp_8x4_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_4x8].filter_vsp = x265_interp_4tap_vert_sp_4x8_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vsp = x265_interp_4tap_vert_sp_16x8_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vsp = x265_interp_4tap_vert_sp_8x16_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vsp = x265_interp_4tap_vert_sp_32x16_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vsp = x265_interp_4tap_vert_sp_16x32_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vsp = x265_interp_4tap_vert_sp_16x12_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_12x16].filter_vsp = x265_interp_4tap_vert_sp_12x16_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vsp = x265_interp_4tap_vert_sp_16x4_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_4x16].filter_vsp = x265_interp_4tap_vert_sp_4x16_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vsp = x265_interp_4tap_vert_sp_32x24_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vsp = x265_interp_4tap_vert_sp_24x32_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vsp = x265_interp_4tap_vert_sp_32x8_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vsp = x265_interp_4tap_vert_sp_8x32_avx2;
+
+ //i422 for chroma_vps
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].filter_vps = x265_interp_4tap_vert_ps_4x8_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vps = x265_interp_4tap_vert_ps_8x16_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vps = x265_interp_4tap_vert_ps_16x32_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_vps = x265_interp_4tap_vert_ps_4x4_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].filter_vps = x265_interp_4tap_vert_ps_2x8_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_vps = x265_interp_4tap_vert_ps_8x8_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].filter_vps = x265_interp_4tap_vert_ps_4x16_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vps = x265_interp_4tap_vert_ps_16x16_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vps = x265_interp_4tap_vert_ps_8x32_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vps = x265_interp_4tap_vert_ps_32x32_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_vps = x265_interp_4tap_vert_ps_8x4_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vps = x265_interp_4tap_vert_ps_16x8_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vps = x265_interp_4tap_vert_ps_32x16_avx2;
+
+ //i444 for chroma_vps
+ p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_vps = x265_interp_4tap_vert_ps_4x4_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vps = x265_interp_4tap_vert_ps_8x8_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vps = x265_interp_4tap_vert_ps_16x16_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vps = x265_interp_4tap_vert_ps_32x32_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_vps = x265_interp_4tap_vert_ps_8x4_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_4x8].filter_vps = x265_interp_4tap_vert_ps_4x8_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vps = x265_interp_4tap_vert_ps_16x8_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vps = x265_interp_4tap_vert_ps_8x16_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vps = x265_interp_4tap_vert_ps_32x16_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vps = x265_interp_4tap_vert_ps_16x32_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vps = x265_interp_4tap_vert_ps_16x12_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_12x16].filter_vps = x265_interp_4tap_vert_ps_12x16_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vps = x265_interp_4tap_vert_ps_16x4_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_4x16].filter_vps = x265_interp_4tap_vert_ps_4x16_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vps = x265_interp_4tap_vert_ps_32x24_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vps = x265_interp_4tap_vert_ps_24x32_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vps = x265_interp_4tap_vert_ps_32x8_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vps = x265_interp_4tap_vert_ps_8x32_avx2;
+
if (cpuMask & X265_CPU_BMI2)
p.scanPosLast = x265_scanPosLast_avx2_bmi2;
}
diff -r a35fafa25df2 -r 55eedcef3708 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Thu Apr 23 12:32:49 2015 -0500
+++ b/source/common/x86/loopfilter.asm Fri Apr 24 19:22:48 2015 +0530
@@ -30,6 +30,8 @@
SECTION_RODATA 32
pb_31: times 32 db 31
pb_15: times 32 db 15
+pb_movemask_32: times 32 db 0x00
+ times 32 db 0xFF
SECTION .text
cextern pb_1
@@ -404,60 +406,66 @@ cglobal saoCuOrgE1_2Rows, 3, 5, 7, pRec,
; void saoCuOrgE2(pixel * rec, int8_t * bufft, int8_t * buff1, int8_t * offsetEo, int lcuWidth, intptr_t stride)
;======================================================================================================================================================
INIT_XMM sse4
-cglobal saoCuOrgE2, 5, 7, 8, rec, bufft, buff1, offsetEo, lcuWidth
-
- mov r6, 16
More information about the x265-commits
mailing list