[x265-commits] [x265] fix build fault on WinXP

Min Chen chenm003 at 163.com
Fri Apr 24 21:22:23 CEST 2015


details:   http://hg.videolan.org/x265/rev/717dd61e2799
branches:  
changeset: 10273:717dd61e2799
user:      Min Chen <chenm003 at 163.com>
date:      Fri Apr 24 21:45:55 2015 +0800
description:
fix build fault on WinXP
Subject: [x265] rdoQuant: split coeff cost into psy and non-psy path

details:   http://hg.videolan.org/x265/rev/56ac27335f04
branches:  
changeset: 10274:56ac27335f04
user:      Min Chen <chenm003 at 163.com>
date:      Fri Apr 24 21:45:58 2015 +0800
description:
rdoQuant: split coeff cost into psy and non-psy path
Subject: [x265] rdoQuant: optimize getSigCtxInc()

details:   http://hg.videolan.org/x265/rev/3d02057c6e65
branches:  
changeset: 10275:3d02057c6e65
user:      Min Chen <chenm003 at 163.com>
date:      Fri Apr 24 21:46:01 2015 +0800
description:
rdoQuant: optimize getSigCtxInc()
Subject: [x265] asm: avx2 code for chroma vsp filter for i422

details:   http://hg.videolan.org/x265/rev/07af054786bf
branches:  
changeset: 10276:07af054786bf
user:      Sumalatha Polureddy
date:      Fri Apr 24 12:07:52 2015 +0530
description:
asm: avx2 code for chroma vsp filter for i422
Subject: [x265] asm: avx2 code for chroma vsp filter for i444

details:   http://hg.videolan.org/x265/rev/27081950e06e
branches:  
changeset: 10277:27081950e06e
user:      Sumalatha Polureddy
date:      Fri Apr 24 13:49:59 2015 +0530
description:
asm: avx2 code for chroma vsp filter for i444
Subject: [x265] asm: avx2 code for chroma vps filter for i422

details:   http://hg.videolan.org/x265/rev/b665d3a0ef4b
branches:  
changeset: 10278:b665d3a0ef4b
user:      Sumalatha Polureddy
date:      Fri Apr 24 15:32:14 2015 +0530
description:
asm: avx2 code for chroma vps filter for i422
Subject: [x265] asm: avx2 code for chroma vps filter for i444

details:   http://hg.videolan.org/x265/rev/0d7402f9ca6b
branches:  
changeset: 10279:0d7402f9ca6b
user:      Sumalatha Polureddy
date:      Fri Apr 24 16:44:32 2015 +0530
description:
asm: avx2 code for chroma vps filter for i444
Subject: [x265] asm: avx2 code for sign primitive: improve 204c->114c

details:   http://hg.videolan.org/x265/rev/0380e8bb5e95
branches:  
changeset: 10280:0380e8bb5e95
user:      Divya Manivannan <divya at multicorewareinc.com>
date:      Fri Apr 24 13:47:58 2015 +0530
description:
asm: avx2 code for sign primitive: improve 204c->114c
Subject: [x265] asm: add pixel restoration part in saoCuOrgE2 primitive

details:   http://hg.videolan.org/x265/rev/b7c98159982a
branches:  
changeset: 10281:b7c98159982a
user:      Divya Manivannan <divya at multicorewareinc.com>
date:      Fri Apr 24 14:10:55 2015 +0530
description:
asm: add pixel restoration part in saoCuOrgE2 primitive
Subject: [x265] asm: avx2 10bit code for add_ps[16x16],[32x32],[64x64]

details:   http://hg.videolan.org/x265/rev/9356573c022a
branches:  
changeset: 10282:9356573c022a
user:      Rajesh Paulraj <rajesh at multicorewareinc.com>
date:      Fri Apr 24 19:20:36 2015 +0530
description:
asm: avx2 10bit code for add_ps[16x16],[32x32],[64x64]

     add_ps[16x16](19.29x), add_ps[32x32](22.42x), add_ps[64x64](26.69x)
Subject: [x265] asm: avx2 10bit code for add_ps for chroma sizes 16xN, 32xN, reuse luma code

details:   http://hg.videolan.org/x265/rev/55eedcef3708
branches:  
changeset: 10283:55eedcef3708
user:      Rajesh Paulraj <rajesh at multicorewareinc.com>
date:      Fri Apr 24 19:22:48 2015 +0530
description:
asm: avx2 10bit code for add_ps for chroma sizes 16xN, 32xN, reuse luma code

diffstat:

 source/CMakeLists.txt                |    2 +-
 source/common/quant.cpp              |  114 +++++++++++--
 source/common/x86/asm-primitives.cpp |   79 ++++++++++
 source/common/x86/loopfilter.asm     |  146 +++++++++++++-----
 source/common/x86/loopfilter.h       |    1 +
 source/common/x86/pixeladd8.asm      |  270 +++++++++++++++++++++++++++++++++-
 source/encoder/sao.cpp               |   17 +--
 source/test/pixelharness.cpp         |    2 +-
 8 files changed, 537 insertions(+), 94 deletions(-)

diffs (truncated from 865 to 300 lines):

diff -r a35fafa25df2 -r 55eedcef3708 source/CMakeLists.txt
--- a/source/CMakeLists.txt	Thu Apr 23 12:32:49 2015 -0500
+++ b/source/CMakeLists.txt	Fri Apr 24 19:22:48 2015 +0530
@@ -305,7 +305,7 @@ if (WIN32)
     if(WINXP_SUPPORT)
         # force use of workarounds for CONDITION_VARIABLE and atomic
         # intrinsics introduced after XP
-        add_definitions(-D_WIN32_WINNT=_WIN32_WINNT_WINXP)
+        add_definitions(-D_WIN32_WINNT=_WIN32_WINNT_WINXP -D_WIN32_WINNT_WIN7=0x0601)
     else(WINXP_SUPPORT)
         # default to targeting Windows 7 for the NUMA APIs
         add_definitions(-D_WIN32_WINNT=_WIN32_WINNT_WIN7)
diff -r a35fafa25df2 -r 55eedcef3708 source/common/quant.cpp
--- a/source/common/quant.cpp	Thu Apr 23 12:32:49 2015 -0500
+++ b/source/common/quant.cpp	Fri Apr 24 19:22:48 2015 +0530
@@ -659,6 +659,45 @@ uint32_t Quant::rdoQuant(const CUData& c
         }
     }
 
+    static const uint8_t table_cnt[5][SCAN_SET_SIZE] =
+    {
+        // patternSigCtx = 0
+        {
+            2, 1, 1, 0,
+            1, 1, 0, 0,
+            1, 0, 0, 0,
+            0, 0, 0, 0,
+        },
+        // patternSigCtx = 1
+        {
+            2, 2, 2, 2,
+            1, 1, 1, 1,
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+        },
+        // patternSigCtx = 2
+        {
+            2, 1, 0, 0,
+            2, 1, 0, 0,
+            2, 1, 0, 0,
+            2, 1, 0, 0,
+        },
+        // patternSigCtx = 3
+        {
+            2, 2, 2, 2,
+            2, 2, 2, 2,
+            2, 2, 2, 2,
+            2, 2, 2, 2,
+        },
+        // 4x4
+        {
+            0, 1, 4, 5,
+            2, 3, 4, 5,
+            6, 6, 8, 8,
+            7, 7, 8, 8
+        }
+    };
+
     /* iterate over coding groups in reverse scan order */
     for (int cgScanPos = cgLastScanPos; cgScanPos >= 0; cgScanPos--)
     {
@@ -668,6 +707,7 @@ uint32_t Quant::rdoQuant(const CUData& c
         const uint32_t cgPosX   = cgBlkPos - (cgPosY << codeParams.log2TrSizeCG);
         const uint64_t cgBlkPosMask = ((uint64_t)1 << cgBlkPos);
         const int patternSigCtx = calcPatternSigCtx(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);
+        const int ctxSigOffset = codeParams.firstSignificanceMapContext + (cgScanPos && bIsLuma ? 3 : 0);
 
         if (c1 == 0)
             ctxSet++;
@@ -676,32 +716,63 @@ uint32_t Quant::rdoQuant(const CUData& c
         if (cgScanPos && (coeffNum[cgScanPos] == 0))
         {
             // TODO: does we need zero-coeff cost?
-            for (int scanPosinCG = 0; scanPosinCG < SCAN_SET_SIZE; scanPosinCG++)
+            const uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
+            uint32_t blkPos = codeParams.scan[scanPosBase];
+
+            if (usePsyMask)
             {
-                scanPos              = (cgScanPos << MLS_CG_SIZE) + scanPosinCG;
-                uint32_t blkPos      = codeParams.scan[scanPos];
+                // TODO: we can't SIMD optimize because PSYVALUE need 64-bits multiplication, convert to Double can work faster by FMA
+                for (int y = 0; y < MLS_CG_SIZE; y++)
+                {
+                    for (int x = 0; x < MLS_CG_SIZE; x++)
+                    {
+                        int signCoef         = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
+                        int predictedCoef    = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
 
-                // TODO: get 16 of ctxSig
-                const uint32_t ctxSig = getSigCtxInc(patternSigCtx, log2TrSize, trSize, blkPos, bIsLuma, codeParams.firstSignificanceMapContext);
+                        costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;
 
-                /* set default costs to uncoded costs */
-                int signCoef         = m_resiDctCoeff[blkPos];            /* pre-quantization DCT coeff */
-                int predictedCoef    = m_fencDctCoeff[blkPos] - signCoef; /* predicted DCT = source DCT - residual DCT*/
+                        /* when no residual coefficient is coded, predicted coef == recon coef */
+                        costUncoded[blkPos + x] -= PSYVALUE(predictedCoef);
 
-                /* cost of not coding this coefficient (all distortion, no signal bits) */
-                costUncoded[blkPos] = ((int64_t)signCoef * signCoef) << scaleBits;
+                        totalUncodedCost += costUncoded[blkPos + x];
+                        totalRdCost += costUncoded[blkPos + x];
 
-                X265_CHECK(scanPos > 0, "scanPos failure\n");
-                if (usePsyMask)
-                    /* when no residual coefficient is coded, predicted coef == recon coef */
-                    costUncoded[blkPos] -= PSYVALUE(predictedCoef);
+                        const uint32_t scanPosOffset =  y * MLS_CG_SIZE + x;
+                        const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset;
+                        X265_CHECK(trSize > 4, "trSize check failure\n");
+                        X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n");
 
-                totalUncodedCost += costUncoded[blkPos];
+                        costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[ctxSig][0]);
+                        costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x];
+                        sigRateDelta[blkPos + x] = estBitsSbac.significantBits[ctxSig][1] - estBitsSbac.significantBits[ctxSig][0];
+                    }
+                    blkPos += trSize;
+                }
+            }
+            else
+            {
+                // non-psy path
+                for (int y = 0; y < MLS_CG_SIZE; y++)
+                {
+                    for (int x = 0; x < MLS_CG_SIZE; x++)
+                    {
+                        int signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
+                        costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;
 
-                costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[ctxSig][0]);
-                costCoeff[scanPos] = costUncoded[blkPos];
-                sigRateDelta[blkPos] = estBitsSbac.significantBits[ctxSig][1] - estBitsSbac.significantBits[ctxSig][0];
-                totalRdCost += costCoeff[scanPos];
+                        totalUncodedCost += costUncoded[blkPos + x];
+                        totalRdCost += costUncoded[blkPos + x];
+
+                        const uint32_t scanPosOffset =  y * MLS_CG_SIZE + x;
+                        const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset;
+                        X265_CHECK(trSize > 4, "trSize check failure\n");
+                        X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n");
+
+                        costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[ctxSig][0]);
+                        costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x];
+                        sigRateDelta[blkPos + x] = estBitsSbac.significantBits[ctxSig][1] - estBitsSbac.significantBits[ctxSig][0];
+                    }
+                    blkPos += trSize;
+                }
             }
 
             /* there were no coded coefficients in this coefficient group */
@@ -745,6 +816,8 @@ uint32_t Quant::rdoQuant(const CUData& c
 
             // coefficient level estimation
             const int* greaterOneBits = estBitsSbac.greaterOneBits[4 * ctxSet + c1];
+            const uint32_t ctxSig = (blkPos == 0) ? 0 : table_cnt[(trSize == 4) ? 4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset;
+            X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, blkPos, bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n");
 
             // before find lastest non-zero coeff
             if (scanPos > (uint32_t)lastScanPos)
@@ -761,8 +834,6 @@ uint32_t Quant::rdoQuant(const CUData& c
             else if (!(subFlagMask & 1))
             {
                 // fast zero coeff path
-                const uint32_t ctxSig = getSigCtxInc(patternSigCtx, log2TrSize, trSize, blkPos, bIsLuma, codeParams.firstSignificanceMapContext);
-
                 /* set default costs to uncoded costs */
                 costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[ctxSig][0]);
                 costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos];
@@ -794,7 +865,6 @@ uint32_t Quant::rdoQuant(const CUData& c
                     sigRateDelta[blkPos] = 0;
                 else
                 {
-                    const uint32_t ctxSig = getSigCtxInc(patternSigCtx, log2TrSize, trSize, blkPos, bIsLuma, codeParams.firstSignificanceMapContext);
                     if (maxAbsLevel < 3)
                     {
                         /* set default costs to uncoded costs */
diff -r a35fafa25df2 -r 55eedcef3708 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Apr 23 12:32:49 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp	Fri Apr 24 19:22:48 2015 +0530
@@ -1223,6 +1223,14 @@ void setupAssemblyPrimitives(EncoderPrim
         ALL_LUMA_PU(luma_vsp, interp_8tap_vert_sp, avx2);
         ALL_LUMA_PU(luma_vss, interp_8tap_vert_ss, avx2);
 
+        p.cu[BLOCK_16x16].add_ps = x265_pixel_add_ps_16x16_avx2;
+        p.cu[BLOCK_32x32].add_ps = x265_pixel_add_ps_32x32_avx2;
+        p.cu[BLOCK_64x64].add_ps = x265_pixel_add_ps_64x64_avx2;
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps = x265_pixel_add_ps_16x16_avx2;
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = x265_pixel_add_ps_32x32_avx2;
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps = x265_pixel_add_ps_16x32_avx2;
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = x265_pixel_add_ps_32x64_avx2;
+
         p.cu[BLOCK_16x16].sub_ps = x265_pixel_sub_ps_16x16_avx2;
         p.cu[BLOCK_32x32].sub_ps = x265_pixel_sub_ps_32x32_avx2;
         p.cu[BLOCK_64x64].sub_ps = x265_pixel_sub_ps_64x64_avx2;
@@ -1720,6 +1728,7 @@ void setupAssemblyPrimitives(EncoderPrim
         p.saoCuOrgE3[0] = x265_saoCuOrgE3_avx2;
         p.saoCuOrgE3[1] = x265_saoCuOrgE3_32_avx2;
         p.saoCuOrgB0 = x265_saoCuOrgB0_avx2;
+        p.sign = x265_calSign_avx2;
 
         p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_avx2;
         p.cu[BLOCK_8x8].psy_cost_ss = x265_psyCost_ss_8x8_avx2;
@@ -2434,6 +2443,76 @@ void setupAssemblyPrimitives(EncoderPrim
         p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hps = x265_interp_4tap_horiz_ps_32x24_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hps = x265_interp_4tap_horiz_ps_32x8_avx2;
 
+        //i422 for chroma_vsp
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].filter_vsp = x265_interp_4tap_vert_sp_4x8_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vsp = x265_interp_4tap_vert_sp_8x16_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vsp = x265_interp_4tap_vert_sp_16x32_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_vsp = x265_interp_4tap_vert_sp_4x4_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].filter_vsp = x265_interp_4tap_vert_sp_2x8_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_vsp = x265_interp_4tap_vert_sp_8x8_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].filter_vsp = x265_interp_4tap_vert_sp_4x16_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vsp = x265_interp_4tap_vert_sp_16x16_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vsp = x265_interp_4tap_vert_sp_8x32_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vsp = x265_interp_4tap_vert_sp_32x32_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_vsp = x265_interp_4tap_vert_sp_8x4_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vsp = x265_interp_4tap_vert_sp_16x8_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vsp = x265_interp_4tap_vert_sp_32x16_avx2;
+
+        //i444 for chroma_vsp
+        p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_vsp = x265_interp_4tap_vert_sp_4x4_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vsp = x265_interp_4tap_vert_sp_8x8_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vsp = x265_interp_4tap_vert_sp_16x16_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vsp = x265_interp_4tap_vert_sp_32x32_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_vsp = x265_interp_4tap_vert_sp_8x4_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_4x8].filter_vsp = x265_interp_4tap_vert_sp_4x8_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vsp = x265_interp_4tap_vert_sp_16x8_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vsp = x265_interp_4tap_vert_sp_8x16_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vsp = x265_interp_4tap_vert_sp_32x16_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vsp = x265_interp_4tap_vert_sp_16x32_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vsp = x265_interp_4tap_vert_sp_16x12_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_12x16].filter_vsp = x265_interp_4tap_vert_sp_12x16_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vsp = x265_interp_4tap_vert_sp_16x4_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_4x16].filter_vsp = x265_interp_4tap_vert_sp_4x16_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vsp = x265_interp_4tap_vert_sp_32x24_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vsp = x265_interp_4tap_vert_sp_24x32_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vsp = x265_interp_4tap_vert_sp_32x8_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vsp = x265_interp_4tap_vert_sp_8x32_avx2;
+
+        //i422 for chroma_vps
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].filter_vps = x265_interp_4tap_vert_ps_4x8_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vps = x265_interp_4tap_vert_ps_8x16_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vps = x265_interp_4tap_vert_ps_16x32_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_vps = x265_interp_4tap_vert_ps_4x4_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].filter_vps = x265_interp_4tap_vert_ps_2x8_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_vps = x265_interp_4tap_vert_ps_8x8_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].filter_vps = x265_interp_4tap_vert_ps_4x16_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vps = x265_interp_4tap_vert_ps_16x16_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vps = x265_interp_4tap_vert_ps_8x32_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vps = x265_interp_4tap_vert_ps_32x32_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_vps = x265_interp_4tap_vert_ps_8x4_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vps = x265_interp_4tap_vert_ps_16x8_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vps = x265_interp_4tap_vert_ps_32x16_avx2;
+
+        //i444 for chroma_vps
+        p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_vps = x265_interp_4tap_vert_ps_4x4_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vps = x265_interp_4tap_vert_ps_8x8_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vps = x265_interp_4tap_vert_ps_16x16_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vps = x265_interp_4tap_vert_ps_32x32_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_vps = x265_interp_4tap_vert_ps_8x4_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_4x8].filter_vps = x265_interp_4tap_vert_ps_4x8_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vps = x265_interp_4tap_vert_ps_16x8_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vps = x265_interp_4tap_vert_ps_8x16_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vps = x265_interp_4tap_vert_ps_32x16_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vps = x265_interp_4tap_vert_ps_16x32_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vps = x265_interp_4tap_vert_ps_16x12_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_12x16].filter_vps = x265_interp_4tap_vert_ps_12x16_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vps = x265_interp_4tap_vert_ps_16x4_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_4x16].filter_vps = x265_interp_4tap_vert_ps_4x16_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vps = x265_interp_4tap_vert_ps_32x24_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vps = x265_interp_4tap_vert_ps_24x32_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vps = x265_interp_4tap_vert_ps_32x8_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vps = x265_interp_4tap_vert_ps_8x32_avx2;
+
         if (cpuMask & X265_CPU_BMI2)
             p.scanPosLast = x265_scanPosLast_avx2_bmi2;
     }
diff -r a35fafa25df2 -r 55eedcef3708 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Thu Apr 23 12:32:49 2015 -0500
+++ b/source/common/x86/loopfilter.asm	Fri Apr 24 19:22:48 2015 +0530
@@ -30,6 +30,8 @@
 SECTION_RODATA 32
 pb_31:      times 32 db 31
 pb_15:      times 32 db 15
+pb_movemask_32:  times 32 db 0x00
+                 times 32 db 0xFF
 
 SECTION .text
 cextern pb_1
@@ -404,60 +406,66 @@ cglobal saoCuOrgE1_2Rows, 3, 5, 7, pRec,
 ; void saoCuOrgE2(pixel * rec, int8_t * bufft, int8_t * buff1, int8_t * offsetEo, int lcuWidth, intptr_t stride)
 ;======================================================================================================================================================
 INIT_XMM sse4
-cglobal saoCuOrgE2, 5, 7, 8, rec, bufft, buff1, offsetEo, lcuWidth
-
-    mov         r6,    16


More information about the x265-commits mailing list