[x265-commits] [x265] asm: AVX2 version cvt32to16_shl

Tue Aug 12 07:41:06 CEST 2014

details:   http://hg.videolan.org/x265/rev/23d58a1819c7
branches:  
changeset: 7762:23d58a1819c7
user:      Min Chen <chenm003 at 163.com>
date:      Mon Aug 11 16:54:09 2014 -0700
description:
asm: AVX2 version cvt32to16_shl
Subject: [x265] replace g_convertToBit[] to g_log2Size[] const table

details:   http://hg.videolan.org/x265/rev/945e071f491f
branches:  
changeset: 7763:945e071f491f
user:      Satoshi Nakagawa <nakagawa424 at oki.com>
date:      Tue Aug 12 12:28:00 2014 +0900
description:
replace g_convertToBit[] with g_log2Size[] const table

diffstat:

 source/Lib/TLibCommon/TComDataCU.cpp |    4 +-
 source/Lib/TLibCommon/TComRom.cpp    |   19 +--
 source/Lib/TLibCommon/TComRom.h      |    2 +-
 source/common/param.cpp              |    9 +-
 source/common/x86/asm-primitives.cpp |    4 +
 source/common/x86/blockcopy8.asm     |  166 +++++++++++++++++++++++++++++++++++
 source/common/x86/blockcopy8.h       |    4 +
 source/encoder/encoder.cpp           |    4 +-
 8 files changed, 192 insertions(+), 20 deletions(-)

diffs (truncated from 341 to 300 lines):

diff -r 2bdcfcc1bb33 -r 945e071f491f source/Lib/TLibCommon/TComDataCU.cpp

--- a/source/Lib/TLibCommon/TComDataCU.cpp	Sun Aug 10 17:22:08 2014 +0900
+++ b/source/Lib/TLibCommon/TComDataCU.cpp	Tue Aug 12 12:28:00 2014 +0900
@@ -141,8 +141,8 @@ void TComDataCU::create(TComDataCU *cu, 
 
     uint32_t tmp = 4 * AMVP_DECIMATION_FACTOR / unitSize;
     tmp = tmp * tmp;
-    X265_CHECK(tmp == (1 << (g_convertToBit[tmp] + 2)), "unexpected pixel count\n");
-    tmp = g_convertToBit[tmp] + 2;
+    X265_CHECK(tmp == (1 << (g_log2Size[tmp])), "unexpected pixel count\n");
+    tmp = g_log2Size[tmp];
     m_unitMask = ~((1 << tmp) - 1);
 
     uint32_t sizeL = cuSize * cuSize;
diff -r 2bdcfcc1bb33 -r 945e071f491f source/Lib/TLibCommon/TComRom.cpp
--- a/source/Lib/TLibCommon/TComRom.cpp	Sun Aug 10 17:22:08 2014 +0900
+++ b/source/Lib/TLibCommon/TComRom.cpp	Tue Aug 12 12:28:00 2014 +0900
@@ -99,16 +99,6 @@ void initROM()
 {
     if (ATOMIC_CAS32(&initialized, 0, 1) == 1)
         return;
-
-    int i, c;
-
-    memset(g_convertToBit, -1, sizeof(g_convertToBit));
-    c = 0;
-    for (i = 4; i <= MAX_CU_SIZE; i *= 2)
-    {
-        g_convertToBit[i] = c;
-        c++;
-    }
 }
 
 void destroyROM()
@@ -300,7 +290,14 @@ const uint8_t g_chromaScale[chromaQPMapp
 const uint8_t g_chroma422IntraAngleMappingTable[36] =
 { 0, 1, 2, 2, 2, 2, 3, 5, 7, 8, 10, 12, 13, 15, 17, 18, 19, 20, 21, 22, 23, 23, 24, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 30, 31, DM_CHROMA_IDX };
 
-uint8_t g_convertToBit[MAX_CU_SIZE + 1];
+const uint8_t g_log2Size[MAX_CU_SIZE + 1] =
+{
+    0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
+    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+    6
+};
 
 // ====================================================================================================================
 // Scanning order & context model mapping
diff -r 2bdcfcc1bb33 -r 945e071f491f source/Lib/TLibCommon/TComRom.h
--- a/source/Lib/TLibCommon/TComRom.h	Sun Aug 10 17:22:08 2014 +0900
+++ b/source/Lib/TLibCommon/TComRom.h	Tue Aug 12 12:28:00 2014 +0900
@@ -134,7 +134,7 @@ extern const uint16_t g_scan4x4[NUM_SCAN
 extern const uint8_t g_minInGroup[10];
 extern const uint8_t g_goRiceRange[5]; // maximum value coded with Rice codes
 
-extern uint8_t g_convertToBit[MAX_CU_SIZE + 1]; // from width to log2(width)-2
+extern const uint8_t g_log2Size[MAX_CU_SIZE + 1]; // from size to log2(size)
 
 // Map Luma samples to chroma samples
 extern const int g_winUnitX[MAX_CHROMA_FORMAT_IDC + 1];
diff -r 2bdcfcc1bb33 -r 945e071f491f source/common/param.cpp
--- a/source/common/param.cpp	Sun Aug 10 17:22:08 2014 +0900
+++ b/source/common/param.cpp	Tue Aug 12 12:28:00 2014 +0900
@@ -861,8 +861,8 @@ int x265_check_params(x265_param *param)
     if (check_failed == 1)
         return check_failed;
 
-    uint32_t maxCUDepth = (uint32_t)g_convertToBit[param->maxCUSize];
-    uint32_t maxLog2CUSize = maxCUDepth + 2;
+    uint32_t maxLog2CUSize = (uint32_t)g_log2Size[param->maxCUSize];
+    uint32_t maxCUDepth = maxLog2CUSize - 2;
     uint32_t tuQTMaxLog2Size = maxLog2CUSize - 1;
     uint32_t tuQTMinLog2Size = 2; //log2(4)
 
@@ -1041,7 +1041,8 @@ void x265_param_apply_fastfirstpass(x265
 
 int x265_set_globals(x265_param *param)
 {
-    uint32_t maxCUDepth = (uint32_t)g_convertToBit[param->maxCUSize];
+    uint32_t maxLog2CUSize = (uint32_t)g_log2Size[param->maxCUSize];
+    uint32_t maxCUDepth = maxLog2CUSize - 2;
     uint32_t tuQTMinLog2Size = 2; //log2(4)
 
     static int once /* = 0 */;
@@ -1058,7 +1059,7 @@ int x265_set_globals(x265_param *param)
     {
         // set max CU width & height
         g_maxCUSize = param->maxCUSize;
-        g_maxLog2CUSize = maxCUDepth + 2;
+        g_maxLog2CUSize = maxLog2CUSize;
 
         // compute actual CU depth with respect to config depth and max transform size
         g_addCUDepth = g_maxLog2CUSize - maxCUDepth - tuQTMinLog2Size;
diff -r 2bdcfcc1bb33 -r 945e071f491f source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Sun Aug 10 17:22:08 2014 +0900
+++ b/source/common/x86/asm-primitives.cpp	Tue Aug 12 12:28:00 2014 +0900
@@ -1708,6 +1708,10 @@ void Setup_Assembly_Primitives(EncoderPr
         p.cvt16to32_cnt[BLOCK_8x8] = x265_cvt16to32_cnt_8_avx2;
         p.cvt16to32_cnt[BLOCK_16x16] = x265_cvt16to32_cnt_16_avx2;
         p.cvt16to32_cnt[BLOCK_32x32] = x265_cvt16to32_cnt_32_avx2;
+        p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_avx2;
+        p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_avx2;
+        p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_avx2;
+        p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_avx2;
     }
 #endif // if HIGH_BIT_DEPTH
 }
diff -r 2bdcfcc1bb33 -r 945e071f491f source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm	Sun Aug 10 17:22:08 2014 +0900
+++ b/source/common/x86/blockcopy8.asm	Tue Aug 12 12:28:00 2014 +0900
@@ -3656,6 +3656,25 @@ cglobal cvt32to16_shl_4, 3,3,5
     RET
 
 
+INIT_YMM avx2
+cglobal cvt32to16_shl_4, 3,3,3
+    add         r2d, r2d
+    movd        xm0, r3m
+
+    ; Row 0-3
+    movu        m1, [r1 + 0 * mmsize]
+    movu        m2, [r1 + 1 * mmsize]
+    packssdw    m1, m2
+    psllw       m1, xm0
+    vextracti128 xm0, m1, 1
+    movq        [r0], xm1
+    movq        [r0 + r2], xm0
+    lea         r0, [r0 + r2 * 2]
+    movhps      [r0], xm1
+    movhps      [r0 + r2], xm0
+    RET
+
+
 ;--------------------------------------------------------------------------------------
 ; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
 ;--------------------------------------------------------------------------------------
@@ -3698,6 +3717,54 @@ cglobal cvt32to16_shl_8, 3,5,5
     RET
 
 
+INIT_YMM avx2
+cglobal cvt32to16_shl_8, 3,4,3
+    add         r2d, r2d
+    movd        xm0, r3m
+    lea         r3, [r2 * 3]
+
+    ; Row 0-1
+    movu        xm1, [r1 + 0 * mmsize]
+    vinserti128  m1, m1, [r1 + 1 * mmsize], 1
+    movu        xm2, [r1 + 0 * mmsize + mmsize/2]
+    vinserti128  m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
+    packssdw    m1, m2
+    psllw       m1, xm0
+    movu        [r0], xm1
+    vextracti128 [r0 + r2], m1, 1
+
+    ; Row 2-3
+    movu        xm1, [r1 + 2 * mmsize]
+    vinserti128  m1, m1, [r1 + 3 * mmsize], 1
+    movu        xm2, [r1 + 2 * mmsize + mmsize/2]
+    vinserti128  m2, m2, [r1 + 3 * mmsize + mmsize/2], 1
+    packssdw    m1, m2
+    psllw       m1, xm0
+    movu        [r0 + r2 * 2], xm1
+    vextracti128 [r0 + r3], m1, 1
+
+    add         r1, 4 * mmsize
+    lea         r0, [r0 + r2 * 4]
+
+    ; Row 4-5
+    movu        m1, [r1 + 0 * mmsize]
+    movu        m2, [r1 + 1 * mmsize]
+    packssdw    m1, m2
+    vpermq      m1, m1, 11011000b
+    psllw       m1, xm0
+    movu        [r0], xm1
+    vextracti128 [r0 + r2], m1, 1
+
+    ; Row 6-7
+    movu        m1, [r1 + 2 * mmsize]
+    movu        m2, [r1 + 3 * mmsize]
+    packssdw    m1, m2
+    vpermq      m1, m1, 11011000b
+    psllw       m1, xm0
+    movu        [r0 + r2 * 2], xm1
+    vextracti128 [r0 + r3], m1, 1
+    RET
+
 ;--------------------------------------------------------------------------------------
 ; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
 ;--------------------------------------------------------------------------------------
@@ -3739,6 +3806,58 @@ cglobal cvt32to16_shl_16, 3,4,5
     RET
 
 
+INIT_YMM avx2
+cglobal cvt32to16_shl_16, 3,5,3
+    add         r2d, r2d
+    movd        xm0, r3m
+    mov         r3d, 16/4
+    lea         r4, [r2 * 3]
+
+.loop:
+    ; Row 0
+    movu        xm1, [r1 + 0 * mmsize]
+    vinserti128  m1, m1, [r1 + 1 * mmsize], 1
+    movu        xm2, [r1 + 0 * mmsize + mmsize/2]
+    vinserti128  m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
+    packssdw    m1, m2
+    psllw       m1, xm0
+    movu        [r0], m1
+
+    ; Row 1
+    movu        xm1, [r1 + 2 * mmsize]
+    vinserti128  m1, m1, [r1 + 3 * mmsize], 1
+    movu        xm2, [r1 + 2 * mmsize + mmsize/2]
+    vinserti128  m2, m2, [r1 + 3 * mmsize + mmsize/2], 1
+    packssdw    m1, m2
+    psllw       m1, xm0
+    movu        [r0 + r2], m1
+
+    add         r1, 4 * mmsize
+
+    ; Row 2
+    movu        xm1, [r1 + 0 * mmsize]
+    vinserti128  m1, m1, [r1 + 1 * mmsize], 1
+    movu        xm2, [r1 + 0 * mmsize + mmsize/2]
+    vinserti128  m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
+    packssdw    m1, m2
+    psllw       m1, xm0
+    movu        [r0 + r2 * 2], m1
+
+    ; Row 3
+    movu        m1, [r1 + 2 * mmsize]
+    movu        m2, [r1 + 3 * mmsize]
+    packssdw    m1, m2
+    psllw       m1, xm0
+    vpermq      m1, m1, 11011000b
+    movu        [r0 + r4], m1
+
+    add         r1, 4 * mmsize
+    lea         r0, [r0 + r2 * 4]
+    dec         r3d
+    jnz        .loop
+    RET
+
+
 ;--------------------------------------------------------------------------------------
 ; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
 ;--------------------------------------------------------------------------------------
@@ -3779,6 +3898,53 @@ cglobal cvt32to16_shl_32, 3,4,5
     RET
 
 
+INIT_YMM avx2
+cglobal cvt32to16_shl_32, 3,4,5
+    add         r2d, r2d
+    movd        xm0, r3m
+    mov         r3d, 32/2
+
+.loop:
+    ; Row 0
+    movu        xm1, [r1 + 0 * mmsize]
+    vinserti128  m1, m1, [r1 + 1 * mmsize], 1
+    movu        xm2, [r1 + 0 * mmsize + mmsize/2]
+    vinserti128  m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
+    movu        xm3, [r1 + 2 * mmsize]
+    vinserti128  m3, m3, [r1 + 3 * mmsize], 1
+    movu        xm4, [r1 + 2 * mmsize + mmsize/2]
+    vinserti128  m4, m4, [r1 + 3 * mmsize + mmsize/2], 1
+    packssdw    m1, m2
+    packssdw    m3, m4
+    psllw       m1, xm0
+    psllw       m3, xm0
+    movu        [r0], m1
+    movu        [r0 + mmsize], m3
+
+    add         r1, 4 * mmsize
+
+    ; Row 1
+    movu        xm1, [r1 + 0 * mmsize]
+    vinserti128  m1, m1, [r1 + 1 * mmsize], 1
+    movu        xm2, [r1 + 0 * mmsize + mmsize/2]
+    vinserti128  m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
+    movu        m3, [r1 + 2 * mmsize]
+    movu        m4, [r1 + 3 * mmsize]
+    packssdw    m1, m2
+    packssdw    m3, m4
+    psllw       m1, xm0
+    psllw       m3, xm0
+    vpermq      m3, m3, 11011000b
+    movu        [r0 + r2], m1
+    movu        [r0 + r2 + mmsize], m3
+
+    add         r1, 4 * mmsize
+    lea         r0, [r0 + r2 * 2]
+    dec         r3d
+    jnz        .loop