[x265-commits] [x265] asm: AVX2 version cvt32to16_shl
Min Chen
chenm003 at 163.com
Tue Aug 12 07:41:06 CEST 2014
details: http://hg.videolan.org/x265/rev/23d58a1819c7
branches:
changeset: 7762:23d58a1819c7
user: Min Chen <chenm003 at 163.com>
date: Mon Aug 11 16:54:09 2014 -0700
description:
asm: AVX2 version cvt32to16_shl
Subject: [x265] replace g_convertToBit[] to g_log2Size[] const table
details: http://hg.videolan.org/x265/rev/945e071f491f
branches:
changeset: 7763:945e071f491f
user: Satoshi Nakagawa <nakagawa424 at oki.com>
date: Tue Aug 12 12:28:00 2014 +0900
description:
replace g_convertToBit[] to g_log2Size[] const table
diffstat:
source/Lib/TLibCommon/TComDataCU.cpp | 4 +-
source/Lib/TLibCommon/TComRom.cpp | 19 +--
source/Lib/TLibCommon/TComRom.h | 2 +-
source/common/param.cpp | 9 +-
source/common/x86/asm-primitives.cpp | 4 +
source/common/x86/blockcopy8.asm | 166 +++++++++++++++++++++++++++++++++++
source/common/x86/blockcopy8.h | 4 +
source/encoder/encoder.cpp | 4 +-
8 files changed, 192 insertions(+), 20 deletions(-)
diffs (truncated from 341 to 300 lines):
diff -r 2bdcfcc1bb33 -r 945e071f491f source/Lib/TLibCommon/TComDataCU.cpp
--- a/source/Lib/TLibCommon/TComDataCU.cpp Sun Aug 10 17:22:08 2014 +0900
+++ b/source/Lib/TLibCommon/TComDataCU.cpp Tue Aug 12 12:28:00 2014 +0900
@@ -141,8 +141,8 @@ void TComDataCU::create(TComDataCU *cu,
uint32_t tmp = 4 * AMVP_DECIMATION_FACTOR / unitSize;
tmp = tmp * tmp;
- X265_CHECK(tmp == (1 << (g_convertToBit[tmp] + 2)), "unexpected pixel count\n");
- tmp = g_convertToBit[tmp] + 2;
+ X265_CHECK(tmp == (1 << (g_log2Size[tmp])), "unexpected pixel count\n");
+ tmp = g_log2Size[tmp];
m_unitMask = ~((1 << tmp) - 1);
uint32_t sizeL = cuSize * cuSize;
diff -r 2bdcfcc1bb33 -r 945e071f491f source/Lib/TLibCommon/TComRom.cpp
--- a/source/Lib/TLibCommon/TComRom.cpp Sun Aug 10 17:22:08 2014 +0900
+++ b/source/Lib/TLibCommon/TComRom.cpp Tue Aug 12 12:28:00 2014 +0900
@@ -99,16 +99,6 @@ void initROM()
{
if (ATOMIC_CAS32(&initialized, 0, 1) == 1)
return;
-
- int i, c;
-
- memset(g_convertToBit, -1, sizeof(g_convertToBit));
- c = 0;
- for (i = 4; i <= MAX_CU_SIZE; i *= 2)
- {
- g_convertToBit[i] = c;
- c++;
- }
}
void destroyROM()
@@ -300,7 +290,14 @@ const uint8_t g_chromaScale[chromaQPMapp
const uint8_t g_chroma422IntraAngleMappingTable[36] =
{ 0, 1, 2, 2, 2, 2, 3, 5, 7, 8, 10, 12, 13, 15, 17, 18, 19, 20, 21, 22, 23, 23, 24, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 30, 31, DM_CHROMA_IDX };
-uint8_t g_convertToBit[MAX_CU_SIZE + 1];
+const uint8_t g_log2Size[MAX_CU_SIZE + 1] =
+{
+ 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 6
+};
// ====================================================================================================================
// Scanning order & context model mapping
diff -r 2bdcfcc1bb33 -r 945e071f491f source/Lib/TLibCommon/TComRom.h
--- a/source/Lib/TLibCommon/TComRom.h Sun Aug 10 17:22:08 2014 +0900
+++ b/source/Lib/TLibCommon/TComRom.h Tue Aug 12 12:28:00 2014 +0900
@@ -134,7 +134,7 @@ extern const uint16_t g_scan4x4[NUM_SCAN
extern const uint8_t g_minInGroup[10];
extern const uint8_t g_goRiceRange[5]; // maximum value coded with Rice codes
-extern uint8_t g_convertToBit[MAX_CU_SIZE + 1]; // from width to log2(width)-2
+extern const uint8_t g_log2Size[MAX_CU_SIZE + 1]; // from size to log2(size)
// Map Luma samples to chroma samples
extern const int g_winUnitX[MAX_CHROMA_FORMAT_IDC + 1];
diff -r 2bdcfcc1bb33 -r 945e071f491f source/common/param.cpp
--- a/source/common/param.cpp Sun Aug 10 17:22:08 2014 +0900
+++ b/source/common/param.cpp Tue Aug 12 12:28:00 2014 +0900
@@ -861,8 +861,8 @@ int x265_check_params(x265_param *param)
if (check_failed == 1)
return check_failed;
- uint32_t maxCUDepth = (uint32_t)g_convertToBit[param->maxCUSize];
- uint32_t maxLog2CUSize = maxCUDepth + 2;
+ uint32_t maxLog2CUSize = (uint32_t)g_log2Size[param->maxCUSize];
+ uint32_t maxCUDepth = maxLog2CUSize - 2;
uint32_t tuQTMaxLog2Size = maxLog2CUSize - 1;
uint32_t tuQTMinLog2Size = 2; //log2(4)
@@ -1041,7 +1041,8 @@ void x265_param_apply_fastfirstpass(x265
int x265_set_globals(x265_param *param)
{
- uint32_t maxCUDepth = (uint32_t)g_convertToBit[param->maxCUSize];
+ uint32_t maxLog2CUSize = (uint32_t)g_log2Size[param->maxCUSize];
+ uint32_t maxCUDepth = maxLog2CUSize - 2;
uint32_t tuQTMinLog2Size = 2; //log2(4)
static int once /* = 0 */;
@@ -1058,7 +1059,7 @@ int x265_set_globals(x265_param *param)
{
// set max CU width & height
g_maxCUSize = param->maxCUSize;
- g_maxLog2CUSize = maxCUDepth + 2;
+ g_maxLog2CUSize = maxLog2CUSize;
// compute actual CU depth with respect to config depth and max transform size
g_addCUDepth = g_maxLog2CUSize - maxCUDepth - tuQTMinLog2Size;
diff -r 2bdcfcc1bb33 -r 945e071f491f source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Sun Aug 10 17:22:08 2014 +0900
+++ b/source/common/x86/asm-primitives.cpp Tue Aug 12 12:28:00 2014 +0900
@@ -1708,6 +1708,10 @@ void Setup_Assembly_Primitives(EncoderPr
p.cvt16to32_cnt[BLOCK_8x8] = x265_cvt16to32_cnt_8_avx2;
p.cvt16to32_cnt[BLOCK_16x16] = x265_cvt16to32_cnt_16_avx2;
p.cvt16to32_cnt[BLOCK_32x32] = x265_cvt16to32_cnt_32_avx2;
+ p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_avx2;
+ p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_avx2;
+ p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_avx2;
+ p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_avx2;
}
#endif // if HIGH_BIT_DEPTH
}
diff -r 2bdcfcc1bb33 -r 945e071f491f source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Sun Aug 10 17:22:08 2014 +0900
+++ b/source/common/x86/blockcopy8.asm Tue Aug 12 12:28:00 2014 +0900
@@ -3656,6 +3656,25 @@ cglobal cvt32to16_shl_4, 3,3,5
RET
+INIT_YMM avx2
+cglobal cvt32to16_shl_4, 3,3,3
+ add r2d, r2d
+ movd xm0, r3m
+
+ ; Row 0-3
+ movu m1, [r1 + 0 * mmsize]
+ movu m2, [r1 + 1 * mmsize]
+ packssdw m1, m2
+ psllw m1, xm0
+ vextracti128 xm0, m1, 1
+ movq [r0], xm1
+ movq [r0 + r2], xm0
+ lea r0, [r0 + r2 * 2]
+ movhps [r0], xm1
+ movhps [r0 + r2], xm0
+ RET
+
+
;--------------------------------------------------------------------------------------
; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
;--------------------------------------------------------------------------------------
@@ -3698,6 +3717,54 @@ cglobal cvt32to16_shl_8, 3,5,5
RET
+INIT_YMM avx2
+cglobal cvt32to16_shl_8, 3,4,3
+ add r2d, r2d
+ movd xm0, r3m
+ lea r3, [r2 * 3]
+
+ ; Row 0-1
+ movu xm1, [r1 + 0 * mmsize]
+ vinserti128 m1, m1, [r1 + 1 * mmsize], 1
+ movu xm2, [r1 + 0 * mmsize + mmsize/2]
+ vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
+ packssdw m1, m2
+ psllw m1, xm0
+ movu [r0], xm1
+ vextracti128 [r0 + r2], m1, 1
+
+ ; Row 2-3
+ movu xm1, [r1 + 2 * mmsize]
+ vinserti128 m1, m1, [r1 + 3 * mmsize], 1
+ movu xm2, [r1 + 2 * mmsize + mmsize/2]
+ vinserti128 m2, m2, [r1 + 3 * mmsize + mmsize/2], 1
+ packssdw m1, m2
+ psllw m1, xm0
+ movu [r0 + r2 * 2], xm1
+ vextracti128 [r0 + r3], m1, 1
+
+ add r1, 4 * mmsize
+ lea r0, [r0 + r2 * 4]
+
+ ; Row 4-5
+ movu m1, [r1 + 0 * mmsize]
+ movu m2, [r1 + 1 * mmsize]
+ packssdw m1, m2
+ vpermq m1, m1, 11011000b
+ psllw m1, xm0
+ movu [r0], xm1
+ vextracti128 [r0 + r2], m1, 1
+
+ ; Row 6-7
+ movu m1, [r1 + 2 * mmsize]
+ movu m2, [r1 + 3 * mmsize]
+ packssdw m1, m2
+ vpermq m1, m1, 11011000b
+ psllw m1, xm0
+ movu [r0 + r2 * 2], xm1
+ vextracti128 [r0 + r3], m1, 1
+ RET
+
;--------------------------------------------------------------------------------------
; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
;--------------------------------------------------------------------------------------
@@ -3739,6 +3806,58 @@ cglobal cvt32to16_shl_16, 3,4,5
RET
+INIT_YMM avx2
+cglobal cvt32to16_shl_16, 3,5,3
+ add r2d, r2d
+ movd xm0, r3m
+ mov r3d, 16/4
+ lea r4, [r2 * 3]
+
+.loop:
+ ; Row 0
+ movu xm1, [r1 + 0 * mmsize]
+ vinserti128 m1, m1, [r1 + 1 * mmsize], 1
+ movu xm2, [r1 + 0 * mmsize + mmsize/2]
+ vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
+ packssdw m1, m2
+ psllw m1, xm0
+ movu [r0], m1
+
+ ; Row 1
+ movu xm1, [r1 + 2 * mmsize]
+ vinserti128 m1, m1, [r1 + 3 * mmsize], 1
+ movu xm2, [r1 + 2 * mmsize + mmsize/2]
+ vinserti128 m2, m2, [r1 + 3 * mmsize + mmsize/2], 1
+ packssdw m1, m2
+ psllw m1, xm0
+ movu [r0 + r2], m1
+
+ add r1, 4 * mmsize
+
+ ; Row 2
+ movu xm1, [r1 + 0 * mmsize]
+ vinserti128 m1, m1, [r1 + 1 * mmsize], 1
+ movu xm2, [r1 + 0 * mmsize + mmsize/2]
+ vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
+ packssdw m1, m2
+ psllw m1, xm0
+ movu [r0 + r2 * 2], m1
+
+ ; Row 3
+ movu m1, [r1 + 2 * mmsize]
+ movu m2, [r1 + 3 * mmsize]
+ packssdw m1, m2
+ psllw m1, xm0
+ vpermq m1, m1, 11011000b
+ movu [r0 + r4], m1
+
+ add r1, 4 * mmsize
+ lea r0, [r0 + r2 * 4]
+ dec r3d
+ jnz .loop
+ RET
+
+
;--------------------------------------------------------------------------------------
; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
;--------------------------------------------------------------------------------------
@@ -3779,6 +3898,53 @@ cglobal cvt32to16_shl_32, 3,4,5
RET
+INIT_YMM avx2
+cglobal cvt32to16_shl_32, 3,4,5
+ add r2d, r2d
+ movd xm0, r3m
+ mov r3d, 32/2
+
+.loop:
+ ; Row 0
+ movu xm1, [r1 + 0 * mmsize]
+ vinserti128 m1, m1, [r1 + 1 * mmsize], 1
+ movu xm2, [r1 + 0 * mmsize + mmsize/2]
+ vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
+ movu xm3, [r1 + 2 * mmsize]
+ vinserti128 m3, m3, [r1 + 3 * mmsize], 1
+ movu xm4, [r1 + 2 * mmsize + mmsize/2]
+ vinserti128 m4, m4, [r1 + 3 * mmsize + mmsize/2], 1
+ packssdw m1, m2
+ packssdw m3, m4
+ psllw m1, xm0
+ psllw m3, xm0
+ movu [r0], m1
+ movu [r0 + mmsize], m3
+
+ add r1, 4 * mmsize
+
+ ; Row 1
+ movu xm1, [r1 + 0 * mmsize]
+ vinserti128 m1, m1, [r1 + 1 * mmsize], 1
+ movu xm2, [r1 + 0 * mmsize + mmsize/2]
+ vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
+ movu m3, [r1 + 2 * mmsize]
+ movu m4, [r1 + 3 * mmsize]
+ packssdw m1, m2
+ packssdw m3, m4
+ psllw m1, xm0
+ psllw m3, xm0
+ vpermq m3, m3, 11011000b
+ movu [r0 + r2], m1
+ movu [r0 + r2 + mmsize], m3
+
+ add r1, 4 * mmsize
+ lea r0, [r0 + r2 * 2]
+ dec r3d
+ jnz .loop
More information about the x265-commits mailing list