[x265-commits] [x265] asm: interp_4tap_horiz_pp sse3
David T Yuen
dtyx265 at gmail.com
Wed Apr 22 18:31:36 CEST 2015
details: http://hg.videolan.org/x265/rev/878ae164f423
branches:
changeset: 10248:878ae164f423
user: David T Yuen <dtyx265 at gmail.com>
date: Tue Apr 21 20:37:47 2015 -0700
description:
asm: interp_4tap_horiz_pp sse3
This replaces c code for 6x8, 6x16, 8x2, 8x4, 8x6, 8x8, 8x12, 8x16, 8x32, 8x64, 12x16, 12x32, 16x8, 16x12,
16x16, 16x24, 16x32, 16x64, 24x32, 24x64, 32x8, 32x16, 32x24, 32x32, 32x48, 32x64, 48x64, 64x16, 64x32,
64x48, 64x64
Macros are used to add the primitives to asm-primitives.cpp
64-bit
./test/TestBench --testbench interp | grep hpp
chroma_hpp[ 8x8] 3.02x 3087.49 9315.21
chroma_hpp[16x16] 3.09x 11813.11 36504.26
chroma_hpp[32x32] 3.45x 46862.27 161615.81
chroma_hpp[ 8x4] 2.94x 1567.50 4614.73
chroma_hpp[ 16x8] 3.10x 5930.00 18377.70
chroma_hpp[ 8x16] 3.02x 6130.00 18520.00
chroma_hpp[32x16] 3.46x 23330.07 80829.76
chroma_hpp[16x32] 3.16x 23572.66 74452.23
chroma_hpp[ 8x6] 2.93x 2339.99 6863.20
chroma_hpp[ 6x8] 2.52x 2812.50 7075.69
chroma_hpp[ 8x2] 2.25x 812.50 1830.00
chroma_hpp[16x12] 3.10x 8875.07 27545.60
chroma_hpp[12x16] 2.80x 9810.20 27476.36
chroma_hpp[ 16x4] 3.05x 2995.20 9144.59
chroma_hpp[32x24] 3.45x 34970.45 120594.13
chroma_hpp[24x32] 3.49x 35116.79 122662.94
chroma_hpp[ 32x8] 3.45x 11699.82 40402.34
chroma_hpp[ 8x32] 3.00x 12210.00 36603.46
chroma_hpp[ 8x16] 3.02x 6130.00 18520.00
chroma_hpp[16x32] 3.09x 23573.10 72827.95
chroma_hpp[32x64] 3.58x 93938.63 335978.50
chroma_hpp[ 8x8] 3.02x 3087.49 9314.74
chroma_hpp[16x16] 3.09x 11815.00 36545.97
chroma_hpp[ 8x32] 3.02x 12212.27 36870.14
chroma_hpp[32x32] 3.45x 46748.56 161259.67
chroma_hpp[16x64] 3.18x 47185.50 150017.53
chroma_hpp[ 8x12] 3.04x 4607.50 14000.63
chroma_hpp[ 6x16] 2.49x 5570.10 13870.01
chroma_hpp[ 8x4] 2.94x 1570.00 4613.64
chroma_hpp[16x24] 3.08x 17690.69 54547.18
chroma_hpp[12x32] 2.80x 19618.33 54833.57
chroma_hpp[ 16x8] 3.10x 5932.57 18377.34
chroma_hpp[32x48] 3.45x 70041.92 241370.78
chroma_hpp[24x64] 3.53x 70596.84 249020.33
chroma_hpp[32x16] 3.44x 23374.66 80340.53
chroma_hpp[ 8x64] 3.00x 24422.17 73313.97
chroma_hpp[ 8x8] 3.01x 3090.00 9314.26
chroma_hpp[16x16] 3.11x 11810.00 36736.14
chroma_hpp[32x32] 3.47x 46771.40 162154.16
chroma_hpp[64x64] 3.25x 195843.97 636910.44
chroma_hpp[ 8x4] 2.94x 1570.00 4613.35
chroma_hpp[ 16x8] 3.10x 5933.42 18381.31
chroma_hpp[ 8x16] 3.02x 6131.43 18520.17
chroma_hpp[32x16] 3.42x 23450.76 80160.37
chroma_hpp[16x32] 3.09x 23619.58 73027.41
chroma_hpp[64x32] 3.42x 92894.85 318107.38
chroma_hpp[32x64] 3.48x 93646.98 325950.78
chroma_hpp[16x12] 3.10x 8874.99 27503.11
chroma_hpp[12x16] 2.83x 9809.99 27769.48
chroma_hpp[ 16x4] 3.05x 2994.99 9138.53
chroma_hpp[32x24] 3.42x 35123.29 120115.27
chroma_hpp[24x32] 3.53x 35143.41 124032.27
chroma_hpp[ 32x8] 3.46x 11692.58 40400.25
chroma_hpp[ 8x32] 3.02x 12212.50 36843.57
chroma_hpp[64x48] 3.36x 140979.36 473912.28
chroma_hpp[48x64] 3.43x 140712.88 482047.69
chroma_hpp[64x16] 3.39x 46530.16 157859.31
chroma_hpp[16x64] 3.08x 47197.85 145477.02
32-bit
./test/TestBench --testbench interp | grep hpp
chroma_hpp[ 8x8] 2.96x 3164.98 9354.15
chroma_hpp[16x16] 3.07x 11885.01 36438.13
chroma_hpp[32x32] 3.48x 46818.91 162929.45
chroma_hpp[ 8x4] 2.86x 1645.00 4703.57
chroma_hpp[ 16x8] 3.06x 6005.10 18378.64
chroma_hpp[ 8x16] 2.97x 6205.00 18429.90
chroma_hpp[32x16] 3.46x 23463.52 81110.52
chroma_hpp[16x32] 3.10x 23700.07 73429.12
chroma_hpp[ 8x6] 2.89x 2404.99 6942.73
chroma_hpp[ 6x8] 2.46x 2905.00 7155.45
chroma_hpp[ 8x2] 2.69x 885.00 2379.96
chroma_hpp[16x12] 3.07x 8945.04 27458.99
chroma_hpp[12x16] 2.81x 9862.55 27753.80
chroma_hpp[ 16x4] 3.01x 3065.00 9231.22
chroma_hpp[32x24] 3.45x 35140.03 121204.09
chroma_hpp[24x32] 3.51x 35262.80 123779.88
chroma_hpp[ 32x8] 3.47x 11765.00 40847.72
chroma_hpp[ 8x32] 2.98x 12285.00 36623.77
chroma_hpp[ 8x16] 2.97x 6205.00 18429.95
chroma_hpp[16x32] 3.08x 23691.43 72971.20
chroma_hpp[32x64] 3.47x 93595.39 324758.03
chroma_hpp[ 8x8] 2.95x 3165.39 9353.01
chroma_hpp[16x16] 3.07x 11885.00 36438.18
chroma_hpp[ 8x32] 2.98x 12285.21 36614.84
chroma_hpp[32x32] 3.48x 46794.59 162647.84
chroma_hpp[16x64] 3.08x 47299.79 145605.62
chroma_hpp[ 8x12] 2.98x 4685.06 13949.95
chroma_hpp[ 6x16] 2.46x 5672.50 13972.76
chroma_hpp[ 8x4] 2.86x 1645.00 4702.53
chroma_hpp[16x24] 3.06x 17765.06 54398.70
chroma_hpp[12x32] 2.79x 19676.93 54843.11
chroma_hpp[ 16x8] 3.06x 6005.12 18377.65
chroma_hpp[32x48] 3.46x 70176.74 243033.73
chroma_hpp[24x64] 3.51x 70367.40 246988.72
chroma_hpp[32x16] 3.47x 23405.43 81235.64
chroma_hpp[ 8x64] 2.97x 24490.71 72757.92
chroma_hpp[ 8x8] 2.95x 3165.00 9352.45
chroma_hpp[16x16] 3.07x 11885.00 36437.35
chroma_hpp[32x32] 3.48x 46781.39 162731.84
chroma_hpp[64x64] 3.28x 193972.66 635870.62
chroma_hpp[ 8x4] 2.86x 1645.00 4702.79
chroma_hpp[ 16x8] 3.06x 6005.00 18377.74
chroma_hpp[ 8x16] 2.97x 6205.04 18430.28
chroma_hpp[32x16] 3.46x 23452.05 81121.86
chroma_hpp[16x32] 3.07x 23695.18 72740.23
chroma_hpp[64x32] 3.42x 92974.16 317723.12
chroma_hpp[32x64] 3.47x 93467.95 324431.16
chroma_hpp[16x12] 3.07x 8945.09 27457.70
chroma_hpp[12x16] 2.79x 9862.54 27477.89
chroma_hpp[ 16x4] 3.01x 3065.02 9231.55
chroma_hpp[32x24] 3.45x 35161.96 121188.20
chroma_hpp[24x32] 3.51x 35275.57 123776.31
chroma_hpp[ 32x8] 3.47x 11765.00 40847.59
chroma_hpp[ 8x32] 2.98x 12285.06 36637.80
chroma_hpp[64x48] 3.41x 139693.42 476274.88
chroma_hpp[48x64] 3.44x 139707.61 480515.22
chroma_hpp[64x16] 3.41x 46575.90 158769.59
chroma_hpp[16x64] 3.08x 47262.82 145408.81
Subject: [x265] sao: remove saoCuOrgE3_2Rows function and modify saoCuOrgE3 primitive to handle width=16 separately
details: http://hg.videolan.org/x265/rev/ee1f11edcb30
branches:
changeset: 10249:ee1f11edcb30
user: Divya Manivannan <divya at multicorewareinc.com>
date: Wed Apr 22 11:59:36 2015 +0530
description:
sao: remove saoCuOrgE3_2Rows function and modify saoCuOrgE3 primitive to handle width=16 separately
Subject: [x265] asm: saoCuOrgE3 avx2 code for width>16: improve 508c->427c
details: http://hg.videolan.org/x265/rev/996d9020066d
branches:
changeset: 10250:996d9020066d
user: Divya Manivannan <divya at multicorewareinc.com>
date: Wed Apr 22 12:25:09 2015 +0530
description:
asm: saoCuOrgE3 avx2 code for width>16: improve 508c->427c
Subject: [x265] asm: avx2 code for satd_16xN, improved over ~50% than SSE code
details: http://hg.videolan.org/x265/rev/065a227130f9
branches:
changeset: 10251:065a227130f9
user: Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
date: Wed Apr 22 11:19:18 2015 +0530
description:
asm: avx2 code for satd_16xN, improved over ~50% than SSE code
Subject: [x265] asm: avx2 code for satd_48x64 and 64xN, improved over ~50% than SSE
details: http://hg.videolan.org/x265/rev/e6f6db57a39e
branches:
changeset: 10252:e6f6db57a39e
user: Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
date: Wed Apr 22 11:27:39 2015 +0530
description:
asm: avx2 code for satd_48x64 and 64xN, improved over ~50% than SSE
Subject: [x265] asm: avx2 10bit code for scale2D_64to32
details: http://hg.videolan.org/x265/rev/f1dc261cbfc3
branches:
changeset: 10253:f1dc261cbfc3
user: Rajesh Paulraj<rajesh at multicorewareinc.com>
date: Wed Apr 22 19:52:06 2015 +0530
description:
asm: avx2 10bit code for scale2D_64to32
AVX2:
scale2D_64to32 17.07x 3873.16 44301.99
SSSE3:
scale2D_64to32 2.75x 14407.30 39553.04
Subject: [x265] asm: fix bug in generic version findPosLast_x64 and improve testbench on it
details: http://hg.videolan.org/x265/rev/dc81ab3cc80e
branches:
changeset: 10254:dc81ab3cc80e
user: Min Chen <chenm003 at 163.com>
date: Wed Apr 22 21:30:33 2015 +0800
description:
asm: fix bug in generic version findPosLast_x64 and improve testbench on it
Subject: [x265] asm: rename findPosLast to scanPosLast and modify its API
details: http://hg.videolan.org/x265/rev/649d12bcfbf2
branches:
changeset: 10255:649d12bcfbf2
user: Min Chen <chenm003 at 163.com>
date: Wed Apr 22 21:30:37 2015 +0800
description:
asm: rename findPosLast to scanPosLast and modify its API
Subject: [x265] testbench: support BMI2
details: http://hg.videolan.org/x265/rev/ee74adac6826
branches:
changeset: 10256:ee74adac6826
user: Min Chen <chenm003 at 163.com>
date: Wed Apr 22 21:31:11 2015 +0800
description:
testbench: support BMI2
Subject: [x265] testbench: fix testbench crash when no coeff in block
details: http://hg.videolan.org/x265/rev/9dce93005839
branches:
changeset: 10257:9dce93005839
user: Min Chen <chenm003 at 163.com>
date: Wed Apr 22 21:31:15 2015 +0800
description:
testbench: fix testbench crash when no coeff in block
Subject: [x265] asm: avx2+bmi2 version of scanPosLast, 27.6k -> 6.8k cycles
details: http://hg.videolan.org/x265/rev/e52ac44c7a49
branches:
changeset: 10258:e52ac44c7a49
user: Min Chen <chenm003 at 163.com>
date: Wed Apr 22 21:31:19 2015 +0800
description:
asm: avx2+bmi2 version of scanPosLast, 27.6k -> 6.8k cycles
Subject: [x265] testbench: fix table fault when trSize more than 8
details: http://hg.videolan.org/x265/rev/ea49b9759a4d
branches:
changeset: 10259:ea49b9759a4d
user: Min Chen <chenm003 at 163.com>
date: Wed Apr 22 21:31:23 2015 +0800
description:
testbench: fix table fault when trSize more than 8
Subject: [x265] threadpool: use Win7 version macro directly, for more clarity
details: http://hg.videolan.org/x265/rev/859daedfbb29
branches:
changeset: 10260:859daedfbb29
user: Steve Borho <steve at borho.org>
date: Wed Apr 22 11:06:24 2015 -0500
description:
threadpool: use Win7 version macro directly, for more clarity
diffstat:
source/common/dct.cpp | 4 +-
source/common/loopfilter.cpp | 23 +-
source/common/primitives.h | 8 +-
source/common/quant.cpp | 4 +-
source/common/threadpool.cpp | 8 +-
source/common/x86/asm-primitives.cpp | 45 ++-
source/common/x86/const-a.asm | 1 +
source/common/x86/ipfilter8.asm | 231 +++++++++++++++++++++
source/common/x86/ipfilter8.h | 32 +++
source/common/x86/loopfilter.asm | 159 ++++----------
source/common/x86/loopfilter.h | 2 +-
source/common/x86/pixel-a.asm | 372 +++++++++++++++++++++++++++++++++++
source/common/x86/pixel-util.h | 4 +-
source/common/x86/pixel-util8.asm | 174 ++++++++++++----
source/encoder/entropy.cpp | 2 +-
source/encoder/sao.cpp | 39 +--
source/test/pixelharness.cpp | 74 ++++--
source/test/pixelharness.h | 6 +-
source/test/testbench.cpp | 1 +
19 files changed, 913 insertions(+), 276 deletions(-)
diffs (truncated from 1678 to 300 lines):
diff -r 86268e498680 -r 859daedfbb29 source/common/dct.cpp
--- a/source/common/dct.cpp Wed Apr 22 00:00:39 2015 -0500
+++ b/source/common/dct.cpp Wed Apr 22 11:06:24 2015 -0500
@@ -752,7 +752,7 @@ void denoiseDct_c(int16_t* dctCoef, uint
}
}
-int findPosLast_c(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig)
+int scanPosLast_c(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* /*scanCG4x4*/, const int /*trSize*/)
{
memset(coeffNum, 0, MLS_GRP_NUM * sizeof(*coeffNum));
memset(coeffFlag, 0, MLS_GRP_NUM * sizeof(*coeffFlag));
@@ -848,7 +848,7 @@ void setupDCTPrimitives_c(EncoderPrimiti
p.cu[BLOCK_16x16].copy_cnt = copy_count<16>;
p.cu[BLOCK_32x32].copy_cnt = copy_count<32>;
- p.findPosLast = findPosLast_c;
+ p.scanPosLast = scanPosLast_c;
p.findPosFirstLast = findPosFirstLast_c;
}
}
diff -r 86268e498680 -r 859daedfbb29 source/common/loopfilter.cpp
--- a/source/common/loopfilter.cpp Wed Apr 22 00:00:39 2015 -0500
+++ b/source/common/loopfilter.cpp Wed Apr 22 11:06:24 2015 -0500
@@ -122,25 +122,6 @@ void processSaoCUE3(pixel *rec, int8_t *
}
}
-void processSaoCUE3_2Rows(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int startX, int endX, int8_t* upBuff)
-{
- int8_t signDown;
- int8_t edgeType;
-
- for (int y = 0; y < 2; y++)
- {
- for (int x = startX + 1; x < endX; x++)
- {
- signDown = signOf(rec[x] - rec[x + stride]);
- edgeType = signDown + upBuff1[x] + 2;
- upBuff1[x - 1] = -signDown;
- rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
- }
- upBuff1[endX - 1] = upBuff[y];
- rec += stride + 1;
- }
-}
-
void processSaoCUB0(pixel* rec, const int8_t* offset, int ctuWidth, int ctuHeight, intptr_t stride)
{
#define SAO_BO_BITS 5
@@ -164,8 +145,8 @@ void setupLoopFilterPrimitives_c(Encoder
p.saoCuOrgE1 = processSaoCUE1;
p.saoCuOrgE1_2Rows = processSaoCUE1_2Rows;
p.saoCuOrgE2 = processSaoCUE2;
- p.saoCuOrgE3 = processSaoCUE3;
- p.saoCuOrgE3_2Rows = processSaoCUE3_2Rows;
+ p.saoCuOrgE3[0] = processSaoCUE3;
+ p.saoCuOrgE3[1] = processSaoCUE3;
p.saoCuOrgB0 = processSaoCUB0;
p.sign = calSign;
}
diff -r 86268e498680 -r 859daedfbb29 source/common/primitives.h
--- a/source/common/primitives.h Wed Apr 22 00:00:39 2015 -0500
+++ b/source/common/primitives.h Wed Apr 22 11:06:24 2015 -0500
@@ -172,7 +172,6 @@ typedef void (*saoCuOrgE0_t)(pixel* rec,
typedef void (*saoCuOrgE1_t)(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
typedef void (*saoCuOrgE2_t)(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
typedef void (*saoCuOrgE3_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX);
-typedef void (*saoCuOrgE3_2Rows_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX, int8_t* upBuff);
typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
@@ -180,7 +179,7 @@ typedef void (*planecopy_sp_t) (const ui
typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
-typedef int (*findPosLast_t)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig);
+typedef int (*scanPosLast_t)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
typedef uint32_t (*findPosFirstLast_t)(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]);
/* Function pointers to optimized encoder primitives. Each pointer can reference
@@ -278,8 +277,7 @@ struct EncoderPrimitives
saoCuOrgE0_t saoCuOrgE0;
saoCuOrgE1_t saoCuOrgE1, saoCuOrgE1_2Rows;
saoCuOrgE2_t saoCuOrgE2;
- saoCuOrgE3_t saoCuOrgE3;
- saoCuOrgE3_2Rows_t saoCuOrgE3_2Rows;
+ saoCuOrgE3_t saoCuOrgE3[2];
saoCuOrgB0_t saoCuOrgB0;
downscale_t frameInitLowres;
@@ -293,7 +291,7 @@ struct EncoderPrimitives
weightp_pp_t weight_pp;
- findPosLast_t findPosLast;
+ scanPosLast_t scanPosLast;
findPosFirstLast_t findPosFirstLast;
/* There is one set of chroma primitives per color space. An encoder will
diff -r 86268e498680 -r 859daedfbb29 source/common/quant.cpp
--- a/source/common/quant.cpp Wed Apr 22 00:00:39 2015 -0500
+++ b/source/common/quant.cpp Wed Apr 22 11:06:24 2015 -0500
@@ -580,12 +580,12 @@ uint32_t Quant::rdoQuant(const CUData& c
uint16_t coeffFlag[MLS_GRP_NUM]; // bit mask map for non-zero coeff
#if CHECKED_BUILD || _DEBUG
- // clean output buffer, the asm version of findPosLast Never output anything after latest non-zero coeff group
+ // clean output buffer, the asm version of scanPosLast Never output anything after latest non-zero coeff group
memset(coeffNum, 0, sizeof(coeffNum));
memset(coeffSign, 0, sizeof(coeffNum));
memset(coeffFlag, 0, sizeof(coeffNum));
#endif
- const int lastScanPos = primitives.findPosLast(codeParams.scan, dstCoeff, coeffSign, coeffFlag, coeffNum, numSig);
+ const int lastScanPos = primitives.scanPosLast(codeParams.scan, dstCoeff, coeffSign, coeffFlag, coeffNum, numSig, g_scan4x4[codeParams.scanType], trSize);
const int cgLastScanPos = (lastScanPos >> LOG2_SCAN_SET_SIZE);
diff -r 86268e498680 -r 859daedfbb29 source/common/threadpool.cpp
--- a/source/common/threadpool.cpp Wed Apr 22 00:00:39 2015 -0500
+++ b/source/common/threadpool.cpp Wed Apr 22 11:06:24 2015 -0500
@@ -232,7 +232,7 @@ ThreadPool* ThreadPool::allocThreadPools
int cpuCount = getCpuCount();
bool bNumaSupport = false;
-#if _WIN32_WINNT >= 0x0601
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
bNumaSupport = true;
#elif HAVE_LIBNUMA
bNumaSupport = numa_available() >= 0;
@@ -241,7 +241,7 @@ ThreadPool* ThreadPool::allocThreadPools
for (int i = 0; i < cpuCount; i++)
{
-#if _WIN32_WINNT >= 0x0601
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
UCHAR node;
if (GetNumaProcessorNode((UCHAR)i, &node))
cpusPerNode[X265_MIN(node, (UCHAR)MAX_NODE_NUM)]++;
@@ -408,7 +408,7 @@ void ThreadPool::setCurrentThreadAffinit
/* static */
void ThreadPool::setThreadNodeAffinity(int numaNode)
{
-#if _WIN32_WINNT >= 0x0601
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
GROUP_AFFINITY groupAffinity;
if (GetNumaNodeProcessorMaskEx((USHORT)numaNode, &groupAffinity))
{
@@ -433,7 +433,7 @@ void ThreadPool::setThreadNodeAffinity(i
/* static */
int ThreadPool::getNumaNodeCount()
{
-#if _WIN32_WINNT >= 0x0601
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
ULONG num = 1;
if (GetNumaHighestNodeNumber(&num))
num++;
diff -r 86268e498680 -r 859daedfbb29 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Apr 22 00:00:39 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Wed Apr 22 11:06:24 2015 -0500
@@ -801,7 +801,7 @@ void setupAssemblyPrimitives(EncoderPrim
#endif
#if X86_64
- //p.findPosLast = x265_findPosLast_x64;
+ p.scanPosLast = x265_scanPosLast_x64;
#endif
if (cpuMask & X265_CPU_SSE2)
@@ -1196,6 +1196,7 @@ void setupAssemblyPrimitives(EncoderPrim
p.dequant_normal = x265_dequant_normal_avx2;
p.scale1D_128to64 = x265_scale1D_128to64_avx2;
+ p.scale2D_64to32 = x265_scale2D_64to32_avx2;
// p.weight_pp = x265_weight_pp_avx2; fails tests
p.cu[BLOCK_16x16].calcresidual = x265_getResidual16_avx2;
@@ -1268,8 +1269,8 @@ void setupAssemblyPrimitives(EncoderPrim
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = x265_filterPixelToShort_32x48_avx2;
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = x265_filterPixelToShort_32x64_avx2;
- if ((cpuMask & X265_CPU_BMI1) && (cpuMask & X265_CPU_BMI2))
- p.findPosLast = x265_findPosLast_x64_bmi2;
+ if (cpuMask & X265_CPU_BMI2)
+ p.scanPosLast = x265_scanPosLast_avx2_bmi2;
}
}
#else // if HIGH_BIT_DEPTH
@@ -1277,7 +1278,7 @@ void setupAssemblyPrimitives(EncoderPrim
void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) // 8bpp
{
#if X86_64
- //p.findPosLast = x265_findPosLast_x64;
+ p.scanPosLast = x265_scanPosLast_x64;
#endif
if (cpuMask & X265_CPU_SSE2)
@@ -1407,18 +1408,9 @@ void setupAssemblyPrimitives(EncoderPrim
}
if (cpuMask & X265_CPU_SSE3)
{
- p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_hpp = x265_interp_4tap_horiz_pp_2x4_sse3;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].filter_hpp = x265_interp_4tap_horiz_pp_2x8_sse3;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_hpp = x265_interp_4tap_horiz_pp_4x2_sse3;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_hpp = x265_interp_4tap_horiz_pp_4x4_sse3;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_hpp = x265_interp_4tap_horiz_pp_4x8_sse3;
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].filter_hpp = x265_interp_4tap_horiz_pp_4x16_sse3;
- p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].filter_hpp = x265_interp_4tap_horiz_pp_2x8_sse3;
- p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].filter_hpp = x265_interp_4tap_horiz_pp_2x16_sse3;
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_hpp = x265_interp_4tap_horiz_pp_4x4_sse3;
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].filter_hpp = x265_interp_4tap_horiz_pp_4x8_sse3;
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].filter_hpp = x265_interp_4tap_horiz_pp_4x16_sse3;
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].filter_hpp = x265_interp_4tap_horiz_pp_4x32_sse3;
+ ALL_CHROMA_420_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
+ ALL_CHROMA_422_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
+ ALL_CHROMA_444_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
}
if (cpuMask & X265_CPU_SSSE3)
{
@@ -1519,8 +1511,8 @@ void setupAssemblyPrimitives(EncoderPrim
p.saoCuOrgE1 = x265_saoCuOrgE1_sse4;
p.saoCuOrgE1_2Rows = x265_saoCuOrgE1_2Rows_sse4;
p.saoCuOrgE2 = x265_saoCuOrgE2_sse4;
- p.saoCuOrgE3 = x265_saoCuOrgE3_sse4;
- p.saoCuOrgE3_2Rows = x265_saoCuOrgE3_2Rows_sse4;
+ p.saoCuOrgE3[0] = x265_saoCuOrgE3_sse4;
+ p.saoCuOrgE3[1] = x265_saoCuOrgE3_sse4;
p.saoCuOrgB0 = x265_saoCuOrgB0_sse4;
LUMA_ADDAVG(sse4);
@@ -1728,7 +1720,8 @@ void setupAssemblyPrimitives(EncoderPrim
p.saoCuOrgE0 = x265_saoCuOrgE0_avx2;
p.saoCuOrgE1 = x265_saoCuOrgE1_avx2;
p.saoCuOrgE1_2Rows = x265_saoCuOrgE1_2Rows_avx2;
- p.saoCuOrgE3 = x265_saoCuOrgE3_avx2;
+ p.saoCuOrgE3[0] = x265_saoCuOrgE3_avx2;
+ p.saoCuOrgE3[1] = x265_saoCuOrgE3_32_avx2;
p.saoCuOrgB0 = x265_saoCuOrgB0_avx2;
p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_avx2;
@@ -1846,11 +1839,21 @@ void setupAssemblyPrimitives(EncoderPrim
p.pu[LUMA_8x16].satd = x265_pixel_satd_8x16_avx2;
p.pu[LUMA_8x8].satd = x265_pixel_satd_8x8_avx2;
+ p.pu[LUMA_16x4].satd = x265_pixel_satd_16x4_avx2;
+ p.pu[LUMA_16x12].satd = x265_pixel_satd_16x12_avx2;
+ p.pu[LUMA_16x32].satd = x265_pixel_satd_16x32_avx2;
+ p.pu[LUMA_16x64].satd = x265_pixel_satd_16x64_avx2;
+
p.pu[LUMA_32x8].satd = x265_pixel_satd_32x8_avx2;
p.pu[LUMA_32x16].satd = x265_pixel_satd_32x16_avx2;
p.pu[LUMA_32x24].satd = x265_pixel_satd_32x24_avx2;
p.pu[LUMA_32x32].satd = x265_pixel_satd_32x32_avx2;
p.pu[LUMA_32x64].satd = x265_pixel_satd_32x64_avx2;
+ p.pu[LUMA_48x64].satd = x265_pixel_satd_48x64_avx2;
+ p.pu[LUMA_64x16].satd = x265_pixel_satd_64x16_avx2;
+ p.pu[LUMA_64x32].satd = x265_pixel_satd_64x32_avx2;
+ p.pu[LUMA_64x48].satd = x265_pixel_satd_64x48_avx2;
+ p.pu[LUMA_64x64].satd = x265_pixel_satd_64x64_avx2;
p.pu[LUMA_32x8].sad = x265_pixel_sad_32x8_avx2;
p.pu[LUMA_32x16].sad = x265_pixel_sad_32x16_avx2;
@@ -2400,8 +2403,8 @@ void setupAssemblyPrimitives(EncoderPrim
p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hps = x265_interp_4tap_horiz_ps_32x24_avx2;
p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hps = x265_interp_4tap_horiz_ps_32x8_avx2;
- if ((cpuMask & X265_CPU_BMI1) && (cpuMask & X265_CPU_BMI2))
- p.findPosLast = x265_findPosLast_x64_bmi2;
+ if (cpuMask & X265_CPU_BMI2)
+ p.scanPosLast = x265_scanPosLast_avx2_bmi2;
}
#endif
}
diff -r 86268e498680 -r 859daedfbb29 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Wed Apr 22 00:00:39 2015 -0500
+++ b/source/common/x86/const-a.asm Wed Apr 22 11:06:24 2015 -0500
@@ -37,6 +37,7 @@ const pb_2, times 32 db
const pb_3, times 16 db 3
const pb_4, times 32 db 4
const pb_8, times 32 db 8
+const pb_15, times 32 db 15
const pb_16, times 32 db 16
const pb_32, times 32 db 32
const pb_64, times 32 db 64
diff -r 86268e498680 -r 859daedfbb29 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Wed Apr 22 00:00:39 2015 -0500
+++ b/source/common/x86/ipfilter8.asm Wed Apr 22 11:06:24 2015 -0500
@@ -594,6 +594,237 @@ cglobal interp_4tap_horiz_pp_4x32, 4, 6,
mov [dstq + dststrideq], r4w
%endmacro
+%macro FILTER_H4_w6_sse2 0
+ pxor m4, m4
+ movh m0, [srcq - 1]
+ movh m5, [srcq]
+ punpckldq m0, m5
+ movhlps m2, m0
+ punpcklbw m0, m4
+ punpcklbw m2, m4
+ movd m1, [srcq + 1]
More information about the x265-commits
mailing list