<div dir="ltr"><div dir="ltr"><div>From 812e7949e643ac7763d80f326c5cb6c0682d2c6a Mon Sep 17 00:00:00 2001</div><div>From: Keshav E <<a href="mailto:keshav@multicorewareinc.com">keshav@multicorewareinc.com</a>></div><div>Date: Fri, 14 Oct 2022 13:48:36 +0530</div><div>Subject: [PATCH] Add x86 ASM implementation for subsampling luma</div><div><br></div><div>---</div><div> source/common/picyuv.cpp             |   4 +-</div><div> source/common/pixel.cpp              |  19 ++</div><div> source/common/primitives.h           |   4 +</div><div> source/common/temporalfilter.cpp     |  29 ---</div><div> source/common/temporalfilter.h       |   2 -</div><div> source/common/x86/asm-primitives.cpp |  14 ++</div><div> source/common/x86/mc-a2.asm          | 256 +++++++++++++++++++++++++++</div><div> source/common/x86/mc.h               |  11 ++</div><div> source/encoder/encoder.cpp           |  12 +-</div><div> source/test/pixelharness.cpp         |  41 +++++</div><div> source/test/pixelharness.h           |   1 +</div><div> 11 files changed, 356 insertions(+), 37 deletions(-)</div><div><br></div><div>diff --git a/source/common/picyuv.cpp b/source/common/picyuv.cpp</div><div>index 2855356b5..58426a613 100644</div><div>--- a/source/common/picyuv.cpp</div><div>+++ b/source/common/picyuv.cpp</div><div>@@ -162,8 +162,8 @@ bool PicYuv::createScaledPicYUV(x265_param* param, uint8_t scaleFactor)</div><div>     uint32_t numCuInWidth = (m_picWidth + param->maxCUSize - 1) / param->maxCUSize;</div><div>     uint32_t numCuInHeight = (m_picHeight + param->maxCUSize - 1) / param->maxCUSize;</div><div> </div><div>-    m_lumaMarginX = 32; // search margin for L0 and L1 ME in horizontal direction</div><div>-    m_lumaMarginY = 32; // search margin for L0 and L1 ME in vertical direction</div><div>+    m_lumaMarginX = 128; // search margin for L0 and L1 ME in horizontal direction</div><div>+    m_lumaMarginY = 128; // search margin for L0 and L1 ME in vertical direction</div><div>     m_stride = 
(numCuInWidth * param->maxCUSize) + (m_lumaMarginX << 1);</div><div> </div><div>     int maxHeight = numCuInHeight * param->maxCUSize;</div><div>diff --git a/source/common/pixel.cpp b/source/common/pixel.cpp</div><div>index 7a40af668..3cd074cfa 100644</div><div>--- a/source/common/pixel.cpp</div><div>+++ b/source/common/pixel.cpp</div><div>@@ -627,6 +627,23 @@ void frame_init_lowres_core(const pixel* src0, pixel* dst0, pixel* dsth, pixel*</div><div>     }</div><div> }</div><div> </div><div>+static</div><div>+void frame_subsample_luma(const pixel* src0, pixel* dst0, intptr_t src_stride, intptr_t dst_stride, int width, int height)</div><div>+{</div><div>+    for (int y = 0; y < height; y++, src0 += 2 * src_stride, dst0 += dst_stride)</div><div>+    {</div><div>+        const pixel *inRow = src0;</div><div>+        const pixel *inRowBelow = src0 + src_stride;</div><div>+        pixel *target = dst0;</div><div>+        for (int x = 0; x < width; x++)</div><div>+        {</div><div>+            target[x] = (((inRow[0] + inRowBelow[0] + 1) >> 1) + ((inRow[1] + inRowBelow[1] + 1) >> 1) + 1) >> 1;</div><div>+            inRow += 2;</div><div>+            inRowBelow += 2;</div><div>+        }</div><div>+    }</div><div>+}</div><div>+</div><div> /* structural similarity metric */</div><div> static void ssim_4x4x2_core(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4])</div><div> {</div><div>@@ -1355,5 +1372,7 @@ void setupPixelPrimitives_c(EncoderPrimitives &p)</div><div>     <a href="http://p.cu">p.cu</a>[BLOCK_16x16].normFact = normFact_c;</div><div>     <a href="http://p.cu">p.cu</a>[BLOCK_32x32].normFact = normFact_c;</div><div>     <a href="http://p.cu">p.cu</a>[BLOCK_64x64].normFact = normFact_c;</div><div>+    /* SubSample Luma*/</div><div>+    p.frameSubSampleLuma = frame_subsample_luma;</div><div> }</div><div> }</div><div>diff --git a/source/common/primitives.h b/source/common/primitives.h</div><div>index 7edfc315f..df1cae4b7 
100644</div><div>--- a/source/common/primitives.h</div><div>+++ b/source/common/primitives.h</div><div>@@ -232,6 +232,8 @@ typedef void(*psyRdoQuant_t1)(int16_t *m_resiDctCoeff, int64_t *costUncoded, int</div><div> typedef void(*psyRdoQuant_t2)(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos);</div><div> typedef void(*ssimDistortion_t)(const pixel *fenc, uint32_t fStride, const pixel *recon,  intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k);</div><div> typedef void(*normFactor_t)(const pixel *src, uint32_t blockSize, int shift, uint64_t *z_k);</div><div>+/* SubSampling Luma */</div><div>+typedef void (*downscaleluma_t)(const pixel* src0, pixel* dstf, intptr_t src_stride, intptr_t dst_stride, int width, int height);</div><div> /* Function pointers to optimized encoder primitives. Each pointer can reference</div><div>  * either an assembly routine, a SIMD intrinsic primitive, or a C function */</div><div> struct EncoderPrimitives</div><div>@@ -353,6 +355,8 @@ struct EncoderPrimitives</div><div> </div><div>     downscale_t           frameInitLowres;</div><div>     downscale_t           frameInitLowerRes;</div><div>+    /* Sub Sample Luma */</div><div>+    downscaleluma_t        frameSubSampleLuma;</div><div>     cutree_propagate_cost propagateCost;</div><div>     cutree_fix8_unpack    fix8Unpack;</div><div>     cutree_fix8_pack      fix8Pack;</div><div>diff --git a/source/common/temporalfilter.cpp b/source/common/temporalfilter.cpp</div><div>index 72edaaac1..d178c5800 100644</div><div>--- a/source/common/temporalfilter.cpp</div><div>+++ b/source/common/temporalfilter.cpp</div><div>@@ -1213,35 +1213,6 @@ void TemporalFilter::motionEstimationLumaDoubleRes(MV *mvs, uint32_t mvStride, P</div><div> }</div><div> #endif</div><div> </div><div>-void TemporalFilter::subsampleLuma(PicYuv *input, PicYuv *output, int 
factor)</div><div>-{</div><div>-</div><div>-    int newWidth = output->m_picWidth;</div><div>-    int newHeight = output->m_picHeight;</div><div>-</div><div>-    pixel* srcRow = input->m_picOrg[0];</div><div>-    intptr_t srcStride = input->m_stride;</div><div>-</div><div>-    pixel *dstRow = output->m_picOrg[0];</div><div>-    intptr_t dstStride = output->m_stride;</div><div>-</div><div>-    for (int y = 0; y < newHeight; y++, srcRow += factor * srcStride, dstRow += dstStride)</div><div>-    {</div><div>-        pixel *inRow = srcRow;</div><div>-        pixel *inRowBelow = srcRow + srcStride;</div><div>-        pixel *target = dstRow;</div><div>-</div><div>-        for (int x = 0; x < newWidth; x++)</div><div>-        {</div><div>-            target[x] = (inRow[0] + inRowBelow[0] + inRow[1] + inRowBelow[1] + 2) >> 2;</div><div>-            inRow += 2;</div><div>-            inRowBelow += 2;</div><div>-        }</div><div>-    }</div><div>-</div><div>-    extendPicBorder(output->m_picOrg[0], output->m_stride, output->m_picWidth, output->m_picHeight, output->m_lumaMarginX, output->m_lumaMarginY);</div><div>-}</div><div>-</div><div> void TemporalFilter::destroyRefPicInfo(TemporalFilterRefPicInfo* curFrame)</div><div> {</div><div>     if (curFrame)</div><div>diff --git a/source/common/temporalfilter.h b/source/common/temporalfilter.h</div><div>index f49fae3c5..55ae6736d 100644</div><div>--- a/source/common/temporalfilter.h</div><div>+++ b/source/common/temporalfilter.h</div><div>@@ -162,8 +162,6 @@ public:</div><div>     Yuv  predPUYuv;</div><div>     int m_useSADinME;</div><div> </div><div>-    void subsampleLuma(PicYuv *input, PicYuv *output, int factor = 2);</div><div>-</div><div>     int createRefPicInfo(TemporalFilterRefPicInfo* refFrame, x265_param* param);</div><div> </div><div>     void bilateralFilter(Frame* frame, TemporalFilterRefPicInfo* mctfRefList, double overallStrength);</div><div>diff --git a/source/common/x86/asm-primitives.cpp 
b/source/common/x86/asm-primitives.cpp</div><div>index 5cad50f07..bb32f32cd 100644</div><div>--- a/source/common/x86/asm-primitives.cpp</div><div>+++ b/source/common/x86/asm-primitives.cpp</div><div>@@ -1091,6 +1091,7 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) // Main10</div><div> </div><div>         p.frameInitLowres = PFX(frame_init_lowres_core_sse2);</div><div>         p.frameInitLowerRes = PFX(frame_init_lowres_core_sse2);</div><div>+        p.frameSubSampleLuma = PFX(frame_subsample_luma_sse2);</div><div>         // TODO: the planecopy_sp is really planecopy_SC now, must be fix it </div><div>         //p.planecopy_sp = PFX(downShift_16_sse2);</div><div>         p.planecopy_sp_shl = PFX(upShift_16_sse2);</div><div>@@ -1121,6 +1122,7 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) // Main10</div><div>     {</div><div>         ASSIGN2(p.scale1D_128to64, scale1D_128to64_ssse3);</div><div>         p.scale2D_64to32 = PFX(scale2D_64to32_ssse3);</div><div>+        p.frameSubSampleLuma = PFX(frame_subsample_luma_ssse3);</div><div> </div><div>         // p.pu[LUMA_4x4].satd = <a href="http://p.cu">p.cu</a>[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_ssse3); this one is broken</div><div>         ALL_LUMA_PU(satd, pixel_satd, ssse3);</div><div>@@ -1462,6 +1464,7 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) // Main10</div><div>         p.pu[LUMA_64x48].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x48_avx);</div><div>         p.pu[LUMA_64x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx);</div><div>         p.propagateCost = PFX(mbtree_propagate_cost_avx);</div><div>+        p.frameSubSampleLuma = PFX(frame_subsample_luma_avx);</div><div>     }</div><div>     if (cpuMask & X265_CPU_XOP)</div><div>     {</div><div>@@ -1473,6 +1476,7 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) // Main10</div><div>         LUMA_VAR(xop);</div><div>         p.frameInitLowres = 
PFX(frame_init_lowres_core_xop);</div><div>         p.frameInitLowerRes = PFX(frame_init_lowres_core_xop);</div><div>+        p.frameSubSampleLuma = PFX(frame_subsample_luma_xop);</div><div>     }</div><div>     if (cpuMask & X265_CPU_AVX2)</div><div>     {</div><div>@@ -2301,6 +2305,9 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) // Main10</div><div> </div><div>         p.frameInitLowres = PFX(frame_init_lowres_core_avx2);</div><div>         p.frameInitLowerRes = PFX(frame_init_lowres_core_avx2);</div><div>+</div><div>+        p.frameSubSampleLuma = PFX(frame_subsample_luma_avx2);</div><div>+</div><div>         p.propagateCost = PFX(mbtree_propagate_cost_avx2);</div><div>         p.fix8Unpack = PFX(cutree_fix8_unpack_avx2);</div><div>         p.fix8Pack = PFX(cutree_fix8_pack_avx2);</div><div>@@ -3300,6 +3307,7 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) // Main</div><div>         //p.frameInitLowres = PFX(frame_init_lowres_core_mmx2);</div><div>         p.frameInitLowres = PFX(frame_init_lowres_core_sse2);</div><div>         p.frameInitLowerRes = PFX(frame_init_lowres_core_sse2);</div><div>+        p.frameSubSampleLuma = PFX(frame_subsample_luma_sse2);</div><div> </div><div>         ALL_LUMA_TU(blockfill_s[NONALIGNED], blockfill_s, sse2);</div><div>         ALL_LUMA_TU(blockfill_s[ALIGNED], blockfill_s, sse2);</div><div>@@ -3424,6 +3432,8 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) // Main</div><div>         ASSIGN2(p.scale1D_128to64, scale1D_128to64_ssse3);</div><div>         p.scale2D_64to32 = PFX(scale2D_64to32_ssse3);</div><div> </div><div>+        p.frameSubSampleLuma = PFX(frame_subsample_luma_ssse3);</div><div>+</div><div>         ASSIGN2(p.pu[LUMA_8x4].convert_p2s, filterPixelToShort_8x4_ssse3);</div><div>         ASSIGN2(p.pu[LUMA_8x8].convert_p2s, filterPixelToShort_8x8_ssse3);</div><div>         ASSIGN2(p.pu[LUMA_8x16].convert_p2s, filterPixelToShort_8x16_ssse3);</div><div>@@ -3691,6 
+3701,7 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) // Main</div><div>         p.frameInitLowres = PFX(frame_init_lowres_core_avx);</div><div>         p.frameInitLowerRes = PFX(frame_init_lowres_core_avx);</div><div>         p.propagateCost = PFX(mbtree_propagate_cost_avx);</div><div>+        p.frameSubSampleLuma = PFX(frame_subsample_luma_avx);</div><div>     }</div><div>     if (cpuMask & X265_CPU_XOP)</div><div>     {</div><div>@@ -3702,6 +3713,7 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) // Main</div><div>         <a href="http://p.cu">p.cu</a>[BLOCK_16x16].sse_pp = PFX(pixel_ssd_16x16_xop);</div><div>         p.frameInitLowres = PFX(frame_init_lowres_core_xop);</div><div>         p.frameInitLowerRes = PFX(frame_init_lowres_core_xop);</div><div>+        p.frameSubSampleLuma = PFX(frame_subsample_luma_xop);</div><div> </div><div>     }</div><div> #if X86_64</div><div>@@ -4684,6 +4696,8 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) // Main</div><div>         p.saoCuStatsE2 = PFX(saoCuStatsE2_avx2);</div><div>         p.saoCuStatsE3 = PFX(saoCuStatsE3_avx2);</div><div> </div><div>+        p.frameSubSampleLuma = PFX(frame_subsample_luma_avx2);</div><div>+</div><div>         if (cpuMask & X265_CPU_BMI2)</div><div>         {</div><div>             p.scanPosLast = PFX(scanPosLast_avx2_bmi2);</div><div>diff --git a/source/common/x86/mc-a2.asm b/source/common/x86/mc-a2.asm</div><div>index fc36b2cc8..94a995ee0 100644</div><div>--- a/source/common/x86/mc-a2.asm</div><div>+++ b/source/common/x86/mc-a2.asm</div><div>@@ -992,6 +992,262 @@ INIT_YMM avx2</div><div> FRAME_INIT_LOWRES</div><div> %endif</div><div> </div><div>+%macro SUBSAMPLEFILT8x4 7</div><div>+    mova      %3, [r0+%7]</div><div>+    mova      %4, [r0+r2+%7]</div><div>+    pavgb     %3, %4</div><div>+    pavgb     %4, [r0+r2*2+%7]</div><div>+    PALIGNR   %1, %3, 1, m6</div><div>+    PALIGNR   %2, %4, 1, m6</div><div>+%if 
cpuflag(xop)</div><div>+    pavgb     %1, %3</div><div>+    pavgb     %2, %4</div><div>+%else</div><div>+    pavgb     %1, %3</div><div>+    pavgb     %2, %4</div><div>+    psrlw     %5, %1, 8</div><div>+    psrlw     %6, %2, 8</div><div>+    pand      %1, m7</div><div>+    pand      %2, m7</div><div>+%endif</div><div>+%endmacro</div><div>+</div><div>+%macro SUBSAMPLEFILT32x4U 1</div><div>+    movu      m1, [r0+r2]</div><div>+    pavgb     m0, m1, [r0]</div><div>+    movu      m3, [r0+r2+1]</div><div>+    pavgb     m2, m3, [r0+1]</div><div>+    pavgb     m1, [r0+r2*2]</div><div>+    pavgb     m3, [r0+r2*2+1]</div><div>+    pavgb     m0, m2</div><div>+    pavgb     m1, m3</div><div>+</div><div>+    movu      m3, [r0+r2+mmsize]</div><div>+    pavgb     m2, m3, [r0+mmsize]</div><div>+    movu      m5, [r0+r2+1+mmsize]</div><div>+    pavgb     m4, m5, [r0+1+mmsize]</div><div>+    pavgb     m2, m4</div><div>+</div><div>+    pshufb    m0, m7</div><div>+    pshufb    m2, m7</div><div>+    punpcklqdq m0, m0, m2</div><div>+    vpermq    m0, m0, q3120</div><div>+    movu    [%1], m0</div><div>+%endmacro</div><div>+</div><div>+%macro SUBSAMPLEFILT16x2 3</div><div>+    mova      m3, [r0+%3+mmsize]</div><div>+    mova      m2, [r0+%3]</div><div>+    pavgb     m3, [r0+%3+r2+mmsize]</div><div>+    pavgb     m2, [r0+%3+r2]</div><div>+    PALIGNR   %1, m3, 1, m6</div><div>+    pavgb     %1, m3</div><div>+    PALIGNR   m3, m2, 1, m6</div><div>+    pavgb     m3, m2</div><div>+%if cpuflag(xop)</div><div>+    vpperm    m3, m3, %1, m6</div><div>+%else</div><div>+    pand      m3, m7</div><div>+    pand      %1, m7</div><div>+    packuswb  m3, %1</div><div>+%endif</div><div>+    mova    [%2], m3</div><div>+    mova      %1, m2</div><div>+%endmacro</div><div>+</div><div>+%macro SUBSAMPLEFILT8x2U 2</div><div>+    mova      m2, [r0+%2]</div><div>+    pavgb     m2, [r0+%2+r2]</div><div>+    mova      m0, [r0+%2+1]</div><div>+    pavgb     m0, [r0+%2+r2+1]</div><div>+    pavgb     m1, 
m3</div><div>+    pavgb     m0, m2</div><div>+    pand      m1, m7</div><div>+    pand      m0, m7</div><div>+    packuswb  m0, m1</div><div>+    mova    [%1], m0</div><div>+%endmacro</div><div>+</div><div>+%macro SUBSAMPLEFILT8xU 2</div><div>+    mova      m3, [r0+%2+8]</div><div>+    mova      m2, [r0+%2]</div><div>+    pavgw     m3, [r0+%2+r2+8]</div><div>+    pavgw     m2, [r0+%2+r2]</div><div>+    movu      m1, [r0+%2+10]</div><div>+    movu      m0, [r0+%2+2]</div><div>+    pavgw     m1, [r0+%2+r2+10]</div><div>+    pavgw     m0, [r0+%2+r2+2]</div><div>+    pavgw     m1, m3</div><div>+    pavgw     m0, m2</div><div>+    psrld     m3, m1, 16</div><div>+    pand      m1, m7</div><div>+    pand      m0, m7</div><div>+    packssdw  m0, m1</div><div>+    movu    [%1], m0</div><div>+%endmacro</div><div>+</div><div>+%macro SUBSAMPLEFILT8xA 3</div><div>+    movu      m3, [r0+%3+mmsize]</div><div>+    movu      m2, [r0+%3]</div><div>+    pavgw     m3, [r0+%3+r2+mmsize]</div><div>+    pavgw     m2, [r0+%3+r2]</div><div>+    PALIGNR   %1, m3, 2, m6</div><div>+    pavgw     %1, m3</div><div>+    PALIGNR   m3, m2, 2, m6</div><div>+    pavgw     m3, m2</div><div>+%if cpuflag(xop)</div><div>+    vpperm    m3, m3, %1, m6</div><div>+%else</div><div>+    pand      m3, m7</div><div>+    pand      %1, m7</div><div>+    packssdw  m3, %1</div><div>+%endif</div><div>+%if cpuflag(avx2)</div><div>+    vpermq     m3, m3, q3120</div><div>+%endif</div><div>+    movu    [%2], m3</div><div>+    movu      %1, m2</div><div>+%endmacro</div><div>+</div><div>+;-----------------------------------------------------------------------------</div><div>+; void frame_subsample_luma( uint8_t *src0, uint8_t *dst0,</div><div>+;                              intptr_t src_stride, intptr_t dst_stride, int width, int height )</div><div>+;-----------------------------------------------------------------------------</div><div>+</div><div>+%macro FRAME_SUBSAMPLE_LUMA 0</div><div>+cglobal frame_subsample_luma, 
6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise</div><div>+%if HIGH_BIT_DEPTH</div><div>+    shl   dword r3m, 1</div><div>+    FIX_STRIDES r2</div><div>+    shl   dword r4m, 1</div><div>+%endif</div><div>+%if mmsize >= 16</div><div>+    add   dword r4m, mmsize-1</div><div>+    and   dword r4m, ~(mmsize-1)</div><div>+%endif</div><div>+    ; src += 2*(height-1)*stride + 2*width</div><div>+    mov      r6d, r5m</div><div>+    dec      r6d</div><div>+    imul     r6d, r2d</div><div>+    add      r6d, r4m</div><div>+    lea       r0, [r0+r6*2]</div><div>+    ; dst += (height-1)*stride + width</div><div>+    mov      r6d, r5m</div><div>+    dec      r6d</div><div>+    imul     r6d, r3m</div><div>+    add      r6d, r4m</div><div>+    add       r1, r6</div><div>+    ; gap = stride - width</div><div>+    mov      r6d, r3m</div><div>+    sub      r6d, r4m</div><div>+    PUSH      r6</div><div>+    %define dst_gap [rsp+gprsize]</div><div>+    mov      r6d, r2d</div><div>+    sub      r6d, r4m</div><div>+    shl      r6d, 1</div><div>+    PUSH      r6</div><div>+    %define src_gap [rsp]</div><div>+%if HIGH_BIT_DEPTH</div><div>+%if cpuflag(xop)</div><div>+    mova      m6, [deinterleave_shuf32a]</div><div>+    mova      m7, [deinterleave_shuf32b]</div><div>+%else</div><div>+    pcmpeqw   m7, m7</div><div>+    psrld     m7, 16</div><div>+%endif</div><div>+.vloop:</div><div>+    mov      r6d, r4m</div><div>+%ifnidn cpuname, mmx2</div><div>+    movu      m0, [r0]</div><div>+    movu      m1, [r0+r2]</div><div>+    pavgw     m0, m1</div><div>+    pavgw     m1, [r0+r2*2]</div><div>+%endif</div><div>+.hloop:</div><div>+    sub       r0, mmsize*2</div><div>+    sub       r1, mmsize</div><div>+%ifidn cpuname, mmx2</div><div>+    SUBSAMPLEFILT8xU r1, 0</div><div>+%else</div><div>+    SUBSAMPLEFILT8xA m0, r1, 0</div><div>+%endif</div><div>+    sub      r6d, mmsize</div><div>+    jg .hloop</div><div>+%else ; !HIGH_BIT_DEPTH</div><div>+%if cpuflag(avx2)</div><div>+    mova   
   m7, [deinterleave_shuf]</div><div>+%elif cpuflag(xop)</div><div>+    mova      m6, [deinterleave_shuf32a]</div><div>+    mova      m7, [deinterleave_shuf32b]</div><div>+%else</div><div>+    pcmpeqb   m7, m7</div><div>+    psrlw     m7, 8</div><div>+%endif</div><div>+.vloop:</div><div>+    mov      r6d, r4m</div><div>+%ifnidn cpuname, mmx2</div><div>+%if mmsize <= 16</div><div>+    mova      m0, [r0]</div><div>+    mova      m1, [r0+r2]</div><div>+    pavgb     m0, m1</div><div>+    pavgb     m1, [r0+r2*2]</div><div>+%endif</div><div>+%endif</div><div>+.hloop:</div><div>+    sub       r0, mmsize*2</div><div>+    sub       r1, mmsize</div><div>+%if mmsize==32</div><div>+    SUBSAMPLEFILT32x4U r1</div><div>+%elifdef m8</div><div>+    SUBSAMPLEFILT8x4   m0, m1, m2, m3, m10, m11, mmsize</div><div>+    mova      m8, m0</div><div>+    mova      m9, m1</div><div>+    SUBSAMPLEFILT8x4   m2, m3, m0, m1, m4, m5, 0</div><div>+%if cpuflag(xop)</div><div>+    vpperm    m4, m2, m8, m7</div><div>+    vpperm    m2, m2, m8, m6</div><div>+%else</div><div>+    packuswb  m2, m8</div><div>+%endif</div><div>+    mova    [r1], m2</div><div>+%elifidn cpuname, mmx2</div><div>+    SUBSAMPLEFILT8x2U  r1, 0</div><div>+%else</div><div>+    SUBSAMPLEFILT16x2  m0, r1, 0</div><div>+%endif</div><div>+    sub      r6d, mmsize</div><div>+    jg .hloop</div><div>+%endif ; HIGH_BIT_DEPTH</div><div>+.skip:</div><div>+    mov       r3, dst_gap</div><div>+    sub       r0, src_gap</div><div>+    sub       r1, r3</div><div>+    dec    dword r5m</div><div>+    jg .vloop</div><div>+    ADD      rsp, 2*gprsize</div><div>+    emms</div><div>+    RET</div><div>+%endmacro ; FRAME_SUBSAMPLE_LUMA</div><div>+</div><div>+INIT_MMX mmx2</div><div>+FRAME_SUBSAMPLE_LUMA</div><div>+%if ARCH_X86_64 == 0</div><div>+INIT_MMX cache32, mmx2</div><div>+FRAME_SUBSAMPLE_LUMA</div><div>+%endif</div><div>+INIT_XMM sse2</div><div>+FRAME_SUBSAMPLE_LUMA</div><div>+INIT_XMM ssse3</div><div>+FRAME_SUBSAMPLE_LUMA</div><div>+INIT_XMM 
avx</div><div>+FRAME_SUBSAMPLE_LUMA</div><div>+INIT_XMM xop</div><div>+FRAME_SUBSAMPLE_LUMA</div><div>+%if ARCH_X86_64 == 1</div><div>+INIT_YMM avx2</div><div>+FRAME_SUBSAMPLE_LUMA</div><div>+%endif</div><div>+</div><div> ;-----------------------------------------------------------------------------</div><div> ; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, int32_t *intra_costs,</div><div> ;                             uint16_t *inter_costs, int32_t *inv_qscales, double *fps_factor, int len )</div><div>diff --git a/source/common/x86/mc.h b/source/common/x86/mc.h</div><div>index 83b97a469..8a0564c14 100644</div><div>--- a/source/common/x86/mc.h</div><div>+++ b/source/common/x86/mc.h</div><div>@@ -36,6 +36,17 @@ LOWRES(xop)</div><div> </div><div> #undef LOWRES</div><div> </div><div>+#define SUBSAMPLELUMA(cpu) \</div><div>+    void PFX(frame_subsample_luma_ ## cpu)(const pixel* src0, pixel* dst0, intptr_t src_stride, intptr_t dst_stride, int width, int height);</div><div>+SUBSAMPLELUMA(mmx2)</div><div>+SUBSAMPLELUMA(sse2)</div><div>+SUBSAMPLELUMA(ssse3)</div><div>+SUBSAMPLELUMA(avx)</div><div>+SUBSAMPLELUMA(avx2)</div><div>+SUBSAMPLELUMA(xop)</div><div>+</div><div>+#undef SUBSAMPLELUMA</div><div>+</div><div> #define PROPAGATE_COST(cpu) \</div><div>     void PFX(mbtree_propagate_cost_ ## cpu)(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, \</div><div>                                               const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);</div><div>diff --git a/source/encoder/encoder.cpp b/source/encoder/encoder.cpp</div><div>index 47a9913bc..c24507e5a 100644</div><div>--- a/source/encoder/encoder.cpp</div><div>+++ b/source/encoder/encoder.cpp</div><div>@@ -2703,8 +2703,10 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out)</div><div> </div><div>                 if (!*frameEnc->m_isSubSampled)</div><div>                 {</div><div>-                    
curEncoder->m_frameEncTF->subsampleLuma(frameEnc->m_fencPic, frameEnc->m_fencPicSubsampled2);</div><div>-                    curEncoder->m_frameEncTF->subsampleLuma(frameEnc->m_fencPicSubsampled2, frameEnc->m_fencPicSubsampled4);</div><div>+                    primitives.frameSubSampleLuma((const pixel *)frameEnc->m_fencPic->m_picOrg[0],frameEnc->m_fencPicSubsampled2->m_picOrg[0], frameEnc->m_fencPic->m_stride, frameEnc->m_fencPicSubsampled2->m_stride, frameEnc->m_fencPicSubsampled2->m_picWidth, frameEnc->m_fencPicSubsampled2->m_picHeight);</div><div>+                    extendPicBorder(frameEnc->m_fencPicSubsampled2->m_picOrg[0], frameEnc->m_fencPicSubsampled2->m_stride, frameEnc->m_fencPicSubsampled2->m_picWidth, frameEnc->m_fencPicSubsampled2->m_picHeight, frameEnc->m_fencPicSubsampled2->m_lumaMarginX, frameEnc->m_fencPicSubsampled2->m_lumaMarginY);</div><div>+                    primitives.frameSubSampleLuma((const pixel *)frameEnc->m_fencPicSubsampled2->m_picOrg[0],frameEnc->m_fencPicSubsampled4->m_picOrg[0], frameEnc->m_fencPicSubsampled2->m_stride, frameEnc->m_fencPicSubsampled4->m_stride, frameEnc->m_fencPicSubsampled4->m_picWidth, frameEnc->m_fencPicSubsampled4->m_picHeight);</div><div>+                    extendPicBorder(frameEnc->m_fencPicSubsampled4->m_picOrg[0], frameEnc->m_fencPicSubsampled4->m_stride, frameEnc->m_fencPicSubsampled4->m_picWidth, frameEnc->m_fencPicSubsampled4->m_picHeight, frameEnc->m_fencPicSubsampled4->m_lumaMarginX, frameEnc->m_fencPicSubsampled4->m_lumaMarginY);</div><div>                     *frameEnc->m_isSubSampled = true;</div><div>                 }</div><div> </div><div>@@ -2713,8 +2715,10 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out)</div><div>                     TemporalFilterRefPicInfo *ref = &curEncoder->m_mcstfRefList[i - 1];</div><div>                     if (!*ref->isSubsampled)</div><div>                     {</div><div>-                        
curEncoder->m_frameEncTF->subsampleLuma(ref->picBuffer, ref->picBufferSubSampled2);</div><div>-                        curEncoder->m_frameEncTF->subsampleLuma(ref->picBufferSubSampled2, ref->picBufferSubSampled4);</div><div>+                        primitives.frameSubSampleLuma((const pixel *)ref->picBuffer->m_picOrg[0], ref->picBufferSubSampled2->m_picOrg[0], ref->picBuffer->m_stride, ref->picBufferSubSampled2->m_stride, ref->picBufferSubSampled2->m_picWidth, ref->picBufferSubSampled2->m_picHeight);</div><div>+                        extendPicBorder(ref->picBufferSubSampled2->m_picOrg[0], ref->picBufferSubSampled2->m_stride, ref->picBufferSubSampled2->m_picWidth, ref->picBufferSubSampled2->m_picHeight, ref->picBufferSubSampled2->m_lumaMarginX, ref->picBufferSubSampled2->m_lumaMarginY); // pad the reference's own 1/2-res plane (it feeds the 1/4-res pass and ME)</div><div>+                        primitives.frameSubSampleLuma((const pixel *)ref->picBufferSubSampled2->m_picOrg[0],ref->picBufferSubSampled4->m_picOrg[0], ref->picBufferSubSampled2->m_stride, ref->picBufferSubSampled4->m_stride, ref->picBufferSubSampled4->m_picWidth, ref->picBufferSubSampled4->m_picHeight);</div><div>+                        extendPicBorder(ref->picBufferSubSampled4->m_picOrg[0], ref->picBufferSubSampled4->m_stride, ref->picBufferSubSampled4->m_picWidth, ref->picBufferSubSampled4->m_picHeight, ref->picBufferSubSampled4->m_lumaMarginX, ref->picBufferSubSampled4->m_lumaMarginY);</div><div>                         *ref->isSubsampled = true;</div><div>                     }</div><div>                 }</div><div>diff --git a/source/test/pixelharness.cpp b/source/test/pixelharness.cpp</div><div>index 6e0af1229..550521666 100644</div><div>--- a/source/test/pixelharness.cpp</div><div>+++ b/source/test/pixelharness.cpp</div><div>@@ -406,6 +406,32 @@ bool PixelHarness::check_downscale_t(downscale_t ref, downscale_t opt)</div><div>     return true;</div><div> }</div><div> </div><div>+bool PixelHarness::check_downscaleluma_t(downscaleluma_t ref, downscaleluma_t 
opt)</div><div>+{</div><div>+    ALIGN_VAR_16(pixel, ref_destf[32 * 32]);</div><div>+    ALIGN_VAR_16(pixel, opt_destf[32 * 32]);</div><div>+</div><div>+    intptr_t src_stride = 64;</div><div>+    intptr_t dst_stride = 32;</div><div>+    int bx = 32;</div><div>+    int by = 32;</div><div>+    int j = 0;</div><div>+    for (int i = 0; i < ITERS; i++)</div><div>+    {</div><div>+        int index = i % TEST_CASES;</div><div>+        ref(pixel_test_buff[index] + j, ref_destf, src_stride, dst_stride, bx, by);</div><div>+        checked(opt, pixel_test_buff[index] + j, opt_destf, src_stride, dst_stride, bx, by);</div><div>+</div><div>+        if (memcmp(ref_destf, opt_destf, 32 * 32 * sizeof(pixel)))</div><div>+            return false;</div><div>+</div><div>+        reportfail();</div><div>+        j += INCR;</div><div>+    }</div><div>+</div><div>+    return true;</div><div>+}</div><div>+</div><div> bool PixelHarness::check_cpy2Dto1D_shl_t(cpy2Dto1D_shl_t ref, cpy2Dto1D_shl_t opt)</div><div> {</div><div>     ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);</div><div>@@ -2793,6 +2819,15 @@ bool PixelHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPr</div><div>         }</div><div>     }</div><div> </div><div>+    if (opt.frameSubSampleLuma)</div><div>+    {</div><div>+        if (!check_downscaleluma_t(ref.frameSubSampleLuma, opt.frameSubSampleLuma))</div><div>+        {</div><div>+            printf("SubSample Luma failed!\n");</div><div>+            return false;</div><div>+        }</div><div>+    }</div><div>+</div><div>     if (opt.scale1D_128to64[NONALIGNED])</div><div>     {</div><div>         if (!check_scale1D_pp(ref.scale1D_128to64[NONALIGNED], opt.scale1D_128to64[NONALIGNED]))</div><div>@@ -3492,6 +3527,12 @@ void PixelHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimi</div><div>         REPORT_SPEEDUP(opt.frameInitLowres, ref.frameInitLowres, pbuf2, pbuf1, pbuf2, pbuf3, pbuf4, 64, 64, 64, 64);</div><div>     }</div><div> 
</div><div>+    if (opt.frameSubSampleLuma)</div><div>+    {</div><div>+        HEADER0("downscaleluma");</div><div>+        REPORT_SPEEDUP(opt.frameSubSampleLuma, ref.frameSubSampleLuma, pbuf2, pbuf1, 64, 64, 64, 64);</div><div>+    }</div><div>+</div><div>     if (opt.scale1D_128to64[NONALIGNED])</div><div>     {</div><div>         HEADER0("scale1D_128to64");</div><div>diff --git a/source/test/pixelharness.h b/source/test/pixelharness.h</div><div>index bf29d36a2..ee43cbeae 100644</div><div>--- a/source/test/pixelharness.h</div><div>+++ b/source/test/pixelharness.h</div><div>@@ -138,6 +138,7 @@ protected:</div><div>     bool check_integral_inith(integralh_t ref, integralh_t opt);</div><div>     bool check_ssimDist(ssimDistortion_t ref, ssimDistortion_t opt);</div><div>     bool check_normFact(normFactor_t ref, normFactor_t opt, int block);</div><div>+    bool check_downscaleluma_t(downscaleluma_t ref, downscaleluma_t opt);</div><div> </div><div> public:</div><div> </div><div>-- </div><div>2.36.0.windows.1</div><div><br></div><div><div dir="ltr" class="gmail_signature"><div dir="ltr"><div><i><font face="georgia, serif">Thanks and Regards,</font></i></div><div><i><font face="georgia, serif"><b>Snehaa.G</b><br>Video Codec Engineer,<br>Media & AI analytics<br><a href="https://multicorewareinc.com/" target="_blank"><img src="https://ci3.googleusercontent.com/mail-sig/AIorK4yEumXeQ2mgcFAR2us9INa7z3rCbl8ordut3fbdeIbuPv0n3EA75Or1rHs0neGaI0WM8mFPz1g"></a><br><span></span><span></span><br></font></i></div></div></div></div></div></div>