[x265] [PATCH 13/14] Add x86 ASM implementation for subsampling luma

Wed Oct 19 07:32:19 UTC 2022

>From 812e7949e643ac7763d80f326c5cb6c0682d2c6a Mon Sep 17 00:00:00 2001
From: Keshav E <keshav at multicorewareinc.com>
Date: Fri, 14 Oct 2022 13:48:36 +0530
Subject: [PATCH] Add x86 ASM implementation for subsampling luma

---
 source/common/picyuv.cpp             |   4 +-
 source/common/pixel.cpp              |  19 ++
 source/common/primitives.h           |   4 +
 source/common/temporalfilter.cpp     |  29 ---
 source/common/temporalfilter.h       |   2 -
 source/common/x86/asm-primitives.cpp |  14 ++
 source/common/x86/mc-a2.asm          | 256 +++++++++++++++++++++++++++
 source/common/x86/mc.h               |  11 ++
 source/encoder/encoder.cpp           |  12 +-
 source/test/pixelharness.cpp         |  41 +++++
 source/test/pixelharness.h           |   1 +
 11 files changed, 356 insertions(+), 37 deletions(-)

diff --git a/source/common/picyuv.cpp b/source/common/picyuv.cpp
index 2855356b5..58426a613 100644
--- a/source/common/picyuv.cpp
+++ b/source/common/picyuv.cpp
@@ -162,8 +162,8 @@ bool PicYuv::createScaledPicYUV(x265_param* param,
uint8_t scaleFactor)
     uint32_t numCuInWidth = (m_picWidth + param->maxCUSize - 1) /
param->maxCUSize;
     uint32_t numCuInHeight = (m_picHeight + param->maxCUSize - 1) /
param->maxCUSize;

-    m_lumaMarginX = 32; // search margin for L0 and L1 ME in horizontal
direction
-    m_lumaMarginY = 32; // search margin for L0 and L1 ME in vertical
direction
+    m_lumaMarginX = 128; // search margin for L0 and L1 ME in horizontal
direction
+    m_lumaMarginY = 128; // search margin for L0 and L1 ME in vertical
direction
     m_stride = (numCuInWidth * param->maxCUSize) + (m_lumaMarginX << 1);

     int maxHeight = numCuInHeight * param->maxCUSize;
diff --git a/source/common/pixel.cpp b/source/common/pixel.cpp
index 7a40af668..3cd074cfa 100644
--- a/source/common/pixel.cpp
+++ b/source/common/pixel.cpp
@@ -627,6 +627,23 @@ void frame_init_lowres_core(const pixel* src0, pixel*
dst0, pixel* dsth, pixel*
     }
 }

+static // C reference for the frameSubSampleLuma primitive: 2x2 downscale; width/height are the OUTPUT dimensions
+void frame_subsample_luma(const pixel* src0, pixel* dst0, intptr_t
src_stride, intptr_t dst_stride, int width, int height)
+{
+    for (int y = 0; y < height; y++, src0 += 2 * src_stride, dst0 +=
dst_stride)
+    {
+        const pixel *inRow = src0; // upper source row of the current row pair
+        const pixel *inRowBelow = src0 + src_stride; // lower source row of the current row pair
+        pixel *target = dst0;
+        for (int x = 0; x < width; x++)
+        {
+            target[x] = (((inRow[0] + inRowBelow[0] + 1) >> 1) +
((inRow[1] + inRowBelow[1] + 1) >> 1) + 1) >> 1; // round-up average per column, then of the two columns (nested rounding, same shape as the chained pavg in the asm versions)
+            inRow += 2; // each output pixel consumes two source pixels
+            inRowBelow += 2;
+        }
+    }
+}
+
 /* structural similarity metric */
 static void ssim_4x4x2_core(const pixel* pix1, intptr_t stride1, const
pixel* pix2, intptr_t stride2, int sums[2][4])
 {
@@ -1355,5 +1372,7 @@ void setupPixelPrimitives_c(EncoderPrimitives &p)
     p.cu[BLOCK_16x16].normFact = normFact_c;
     p.cu[BLOCK_32x32].normFact = normFact_c;
     p.cu[BLOCK_64x64].normFact = normFact_c;
+    /* SubSample Luma*/
+    p.frameSubSampleLuma = frame_subsample_luma;
 }
 }
diff --git a/source/common/primitives.h b/source/common/primitives.h
index 7edfc315f..df1cae4b7 100644
--- a/source/common/primitives.h
+++ b/source/common/primitives.h
@@ -232,6 +232,8 @@ typedef void(*psyRdoQuant_t1)(int16_t *m_resiDctCoeff,
int64_t *costUncoded, int
 typedef void(*psyRdoQuant_t2)(int16_t *m_resiDctCoeff, int16_t
*m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t
*totalRdCost, int64_t *psyScale, uint32_t blkPos);
 typedef void(*ssimDistortion_t)(const pixel *fenc, uint32_t fStride, const
pixel *recon,  intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t
*ac_k);
 typedef void(*normFactor_t)(const pixel *src, uint32_t blockSize, int
shift, uint64_t *z_k);
+/* SubSampling Luma */
+typedef void (*downscaleluma_t)(const pixel* src0, pixel* dstf, intptr_t
src_stride, intptr_t dst_stride, int width, int height);
 /* Function pointers to optimized encoder primitives. Each pointer can
reference
  * either an assembly routine, a SIMD intrinsic primitive, or a C function
*/
 struct EncoderPrimitives
@@ -353,6 +355,8 @@ struct EncoderPrimitives

     downscale_t           frameInitLowres;
     downscale_t           frameInitLowerRes;
+    /* Sub Sample Luma */
+    downscaleluma_t        frameSubSampleLuma;
     cutree_propagate_cost propagateCost;
     cutree_fix8_unpack    fix8Unpack;
     cutree_fix8_pack      fix8Pack;
diff --git a/source/common/temporalfilter.cpp
b/source/common/temporalfilter.cpp
index 72edaaac1..d178c5800 100644
--- a/source/common/temporalfilter.cpp
+++ b/source/common/temporalfilter.cpp
@@ -1213,35 +1213,6 @@ void
TemporalFilter::motionEstimationLumaDoubleRes(MV *mvs, uint32_t mvStride, P
 }
 #endif

-void TemporalFilter::subsampleLuma(PicYuv *input, PicYuv *output, int
factor)
-{
-
-    int newWidth = output->m_picWidth;
-    int newHeight = output->m_picHeight;
-
-    pixel* srcRow = input->m_picOrg[0];
-    intptr_t srcStride = input->m_stride;
-
-    pixel *dstRow = output->m_picOrg[0];
-    intptr_t dstStride = output->m_stride;
-
-    for (int y = 0; y < newHeight; y++, srcRow += factor * srcStride,
dstRow += dstStride)
-    {
-        pixel *inRow = srcRow;
-        pixel *inRowBelow = srcRow + srcStride;
-        pixel *target = dstRow;
-
-        for (int x = 0; x < newWidth; x++)
-        {
-            target[x] = (inRow[0] + inRowBelow[0] + inRow[1] +
inRowBelow[1] + 2) >> 2;
-            inRow += 2;
-            inRowBelow += 2;
-        }
-    }
-
-    extendPicBorder(output->m_picOrg[0], output->m_stride,
output->m_picWidth, output->m_picHeight, output->m_lumaMarginX,
output->m_lumaMarginY);
-}
-
 void TemporalFilter::destroyRefPicInfo(TemporalFilterRefPicInfo* curFrame)
 {
     if (curFrame)
diff --git a/source/common/temporalfilter.h b/source/common/temporalfilter.h
index f49fae3c5..55ae6736d 100644
--- a/source/common/temporalfilter.h
+++ b/source/common/temporalfilter.h
@@ -162,8 +162,6 @@ public:
     Yuv  predPUYuv;
     int m_useSADinME;

-    void subsampleLuma(PicYuv *input, PicYuv *output, int factor = 2);
-
     int createRefPicInfo(TemporalFilterRefPicInfo* refFrame, x265_param*
param);

     void bilateralFilter(Frame* frame, TemporalFilterRefPicInfo*
mctfRefList, double overallStrength);
diff --git a/source/common/x86/asm-primitives.cpp
b/source/common/x86/asm-primitives.cpp
index 5cad50f07..bb32f32cd 100644
--- a/source/common/x86/asm-primitives.cpp
+++ b/source/common/x86/asm-primitives.cpp
@@ -1091,6 +1091,7 @@ void setupAssemblyPrimitives(EncoderPrimitives &p,
int cpuMask) // Main10

         p.frameInitLowres = PFX(frame_init_lowres_core_sse2);
         p.frameInitLowerRes = PFX(frame_init_lowres_core_sse2);
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_sse2);
         // TODO: the planecopy_sp is really planecopy_SC now, must be fix
it
         //p.planecopy_sp = PFX(downShift_16_sse2);
         p.planecopy_sp_shl = PFX(upShift_16_sse2);
@@ -1121,6 +1122,7 @@ void setupAssemblyPrimitives(EncoderPrimitives &p,
int cpuMask) // Main10
     {
         ASSIGN2(p.scale1D_128to64, scale1D_128to64_ssse3);
         p.scale2D_64to32 = PFX(scale2D_64to32_ssse3);
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_ssse3);

         // p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d =
PFX(pixel_satd_4x4_ssse3); this one is broken
         ALL_LUMA_PU(satd, pixel_satd, ssse3);
@@ -1462,6 +1464,7 @@ void setupAssemblyPrimitives(EncoderPrimitives &p,
int cpuMask) // Main10
         p.pu[LUMA_64x48].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x48_avx);
         p.pu[LUMA_64x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx);
         p.propagateCost = PFX(mbtree_propagate_cost_avx);
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_avx);
     }
     if (cpuMask & X265_CPU_XOP)
     {
@@ -1473,6 +1476,7 @@ void setupAssemblyPrimitives(EncoderPrimitives &p,
int cpuMask) // Main10
         LUMA_VAR(xop);
         p.frameInitLowres = PFX(frame_init_lowres_core_xop);
         p.frameInitLowerRes = PFX(frame_init_lowres_core_xop);
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_xop);
     }
     if (cpuMask & X265_CPU_AVX2)
     {
@@ -2301,6 +2305,9 @@ void setupAssemblyPrimitives(EncoderPrimitives &p,
int cpuMask) // Main10

         p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
         p.frameInitLowerRes = PFX(frame_init_lowres_core_avx2);
+
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_avx2);
+
         p.propagateCost = PFX(mbtree_propagate_cost_avx2);
         p.fix8Unpack = PFX(cutree_fix8_unpack_avx2);
         p.fix8Pack = PFX(cutree_fix8_pack_avx2);
@@ -3300,6 +3307,7 @@ void setupAssemblyPrimitives(EncoderPrimitives &p,
int cpuMask) // Main
         //p.frameInitLowres = PFX(frame_init_lowres_core_mmx2);
         p.frameInitLowres = PFX(frame_init_lowres_core_sse2);
         p.frameInitLowerRes = PFX(frame_init_lowres_core_sse2);
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_sse2);

         ALL_LUMA_TU(blockfill_s[NONALIGNED], blockfill_s, sse2);
         ALL_LUMA_TU(blockfill_s[ALIGNED], blockfill_s, sse2);
@@ -3424,6 +3432,8 @@ void setupAssemblyPrimitives(EncoderPrimitives &p,
int cpuMask) // Main
         ASSIGN2(p.scale1D_128to64, scale1D_128to64_ssse3);
         p.scale2D_64to32 = PFX(scale2D_64to32_ssse3);

+        p.frameSubSampleLuma = PFX(frame_subsample_luma_ssse3);
+
         ASSIGN2(p.pu[LUMA_8x4].convert_p2s, filterPixelToShort_8x4_ssse3);
         ASSIGN2(p.pu[LUMA_8x8].convert_p2s, filterPixelToShort_8x8_ssse3);
         ASSIGN2(p.pu[LUMA_8x16].convert_p2s,
filterPixelToShort_8x16_ssse3);
@@ -3691,6 +3701,7 @@ void setupAssemblyPrimitives(EncoderPrimitives &p,
int cpuMask) // Main
         p.frameInitLowres = PFX(frame_init_lowres_core_avx);
         p.frameInitLowerRes = PFX(frame_init_lowres_core_avx);
         p.propagateCost = PFX(mbtree_propagate_cost_avx);
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_avx);
     }
     if (cpuMask & X265_CPU_XOP)
     {
@@ -3702,6 +3713,7 @@ void setupAssemblyPrimitives(EncoderPrimitives &p,
int cpuMask) // Main
         p.cu[BLOCK_16x16].sse_pp = PFX(pixel_ssd_16x16_xop);
         p.frameInitLowres = PFX(frame_init_lowres_core_xop);
         p.frameInitLowerRes = PFX(frame_init_lowres_core_xop);
+        p.frameSubSampleLuma = PFX(frame_subsample_luma_xop);

     }
 #if X86_64
@@ -4684,6 +4696,8 @@ void setupAssemblyPrimitives(EncoderPrimitives &p,
int cpuMask) // Main
         p.saoCuStatsE2 = PFX(saoCuStatsE2_avx2);
         p.saoCuStatsE3 = PFX(saoCuStatsE3_avx2);

+        p.frameSubSampleLuma = PFX(frame_subsample_luma_avx2);
+
         if (cpuMask & X265_CPU_BMI2)
         {
             p.scanPosLast = PFX(scanPosLast_avx2_bmi2);
diff --git a/source/common/x86/mc-a2.asm b/source/common/x86/mc-a2.asm
index fc36b2cc8..94a995ee0 100644
--- a/source/common/x86/mc-a2.asm
+++ b/source/common/x86/mc-a2.asm
@@ -992,6 +992,262 @@ INIT_YMM avx2
 FRAME_INIT_LOWRES
 %endif

+%macro SUBSAMPLEFILT8x4 7 ; %1,%2 = carry-in/result regs, %3,%4 = row-average regs, %5,%6 = scratch, %7 = byte offset from r0
+    mova      %3, [r0+%7] ; row 0
+    mova      %4, [r0+r2+%7] ; row 1 (r2 = source stride)
+    pavgb     %3, %4 ; %3 = avg(row 0, row 1)
+    pavgb     %4, [r0+r2*2+%7] ; %4 = avg(row 1, row 2)
+    PALIGNR   %1, %3, 1, m6 ; presumably %1 = %3 shifted by one byte with the old %1 shifted in (m6 = temp) - confirm against x86util
+    PALIGNR   %2, %4, 1, m6
+%if cpuflag(xop)
+    pavgb     %1, %3 ; average each column with its right neighbour
+    pavgb     %2, %4
+%else
+    pavgb     %1, %3 ; average each column with its right neighbour
+    pavgb     %2, %4
+    psrlw     %5, %1, 8 ; odd-column results moved into the low byte of each word
+    psrlw     %6, %2, 8
+    pand      %1, m7 ; even-column results (caller sets m7 to 0x00ff per word)
+    pand      %2, m7
+%endif
+%endmacro ; NOTE(review): the caller in this patch only stores the %1 results
+
+%macro SUBSAMPLEFILT32x4U 1 ; %1 = dst address; used when mmsize==32: two mmsize-wide source chunks -> one mmsize-wide store
+    movu      m1, [r0+r2] ; row 1
+    pavgb     m0, m1, [r0] ; m0 = avg(row 0, row 1)
+    movu      m3, [r0+r2+1] ; row 1, one byte to the right
+    pavgb     m2, m3, [r0+1] ; m2 = avg(row 0, row 1), one byte to the right
+    pavgb     m1, [r0+r2*2] ; avg(row 1, row 2)
+    pavgb     m3, [r0+r2*2+1]
+    pavgb     m0, m2 ; average each column with its right neighbour
+    pavgb     m1, m3 ; NOTE(review): this row-1/row-2 result is never stored by the macro
+
+    movu      m3, [r0+r2+mmsize] ; same steps for the second source chunk
+    pavgb     m2, m3, [r0+mmsize]
+    movu      m5, [r0+r2+1+mmsize]
+    pavgb     m4, m5, [r0+1+mmsize]
+    pavgb     m2, m4
+
+    pshufb    m0, m7 ; m7 = [deinterleave_shuf] loaded by the caller; presumably gathers the even bytes of each lane - confirm
+    pshufb    m2, m7
+    punpcklqdq m0, m0, m2 ; per lane: low qword of m0 followed by low qword of m2
+    vpermq    m0, m0, q3120 ; reorder qwords 0,2,1,3 so first-chunk results precede second-chunk results
+    movu    [%1], m0
+%endmacro
+
+%macro SUBSAMPLEFILT16x2 3 ; %1 = carry register (in: row average right of this block, out: this block's low-chunk average), %2 = dst address, %3 = byte offset from r0
+    mova      m3, [r0+%3+mmsize] ; high chunk, row 0
+    mova      m2, [r0+%3] ; low chunk, row 0
+    pavgb     m3, [r0+%3+r2+mmsize] ; vertical average with row 1
+    pavgb     m2, [r0+%3+r2]
+    PALIGNR   %1, m3, 1, m6 ; presumably %1 = high chunk shifted by one byte with the carry shifted in - confirm against x86util
+    pavgb     %1, m3 ; average each column with its right neighbour (high chunk)
+    PALIGNR   m3, m2, 1, m6
+    pavgb     m3, m2 ; same for the low chunk
+%if cpuflag(xop)
+    vpperm    m3, m3, %1, m6 ; m6 = [deinterleave_shuf32a] loaded by the caller
+%else
+    pand      m3, m7 ; keep even-column results (m7 = 0x00ff per word)
+    pand      %1, m7
+    packuswb  m3, %1 ; pack low-chunk then high-chunk results into bytes
+%endif
+    mova    [%2], m3
+    mova      %1, m2 ; carry this block's low-chunk average into the next (leftward) block
+%endmacro
+
+%macro SUBSAMPLEFILT8x2U 2 ; %1 = dst address, %2 = byte offset from r0; 8-byte-register path: 16 source bytes -> 8 output bytes
+    mova      m3, [r0+%2+8] ; upper 8 source bytes, row 0
+    mova      m2, [r0+%2] ; lower 8 source bytes, row 0
+    pavgb     m3, [r0+%2+r2+8] ; vertical average with row 1
+    pavgb     m2, [r0+%2+r2]
+    mova      m1, [r0+%2+9] ; same rows, one byte to the right
+    mova      m0, [r0+%2+1]
+    pavgb     m1, [r0+%2+r2+9]
+    pavgb     m0, [r0+%2+r2+1]
+    pavgb     m1, m3 ; average each column with its right neighbour; m1/m3 were previously left unloaded here
+    pavgb     m0, m2
+    pand      m1, m7 ; keep even-column results (m7 = 0x00ff per word)
+    pand      m0, m7
+    packuswb  m0, m1 ; lower-half results followed by upper-half results
+    mova    [%1], m0
+%endmacro
+
+%macro SUBSAMPLEFILT8xU 2 ; %1 = dst address, %2 = byte offset from r0; word-pixel variant used for the mmx2 high-bit-depth build
+    mova      m3, [r0+%2+8] ; upper chunk, row 0
+    mova      m2, [r0+%2] ; lower chunk, row 0
+    pavgw     m3, [r0+%2+r2+8] ; vertical average with row 1
+    pavgw     m2, [r0+%2+r2]
+    movu      m1, [r0+%2+10] ; same rows, one word to the right
+    movu      m0, [r0+%2+2]
+    pavgw     m1, [r0+%2+r2+10]
+    pavgw     m0, [r0+%2+r2+2]
+    pavgw     m1, m3 ; average each column with its right neighbour
+    pavgw     m0, m2
+    pand      m1, m7 ; keep even-column results (m7 = 0x0000ffff per dword); the unused odd-column extraction into m3 was dropped
+    pand      m0, m7
+    packssdw  m0, m1 ; lower-chunk results followed by upper-chunk results
+    movu    [%1], m0
+%endmacro
+
+%macro SUBSAMPLEFILT8xA 3 ; %1 = carry register (in: row average right of this block, out: this block's low-chunk average), %2 = dst address, %3 = byte offset from r0
+    movu      m3, [r0+%3+mmsize] ; high chunk, row 0
+    movu      m2, [r0+%3] ; low chunk, row 0
+    pavgw     m3, [r0+%3+r2+mmsize] ; vertical average with row 1
+    pavgw     m2, [r0+%3+r2]
+    PALIGNR   %1, m3, 2, m6 ; presumably %1 = high chunk shifted by one word with the carry shifted in - confirm against x86util
+    pavgw     %1, m3 ; average each column with its right neighbour (high chunk)
+    PALIGNR   m3, m2, 2, m6
+    pavgw     m3, m2 ; same for the low chunk
+%if cpuflag(xop)
+    vpperm    m3, m3, %1, m6 ; m6 = [deinterleave_shuf32a] loaded by the caller
+%else
+    pand      m3, m7 ; keep even-column results (m7 = 0x0000ffff per dword)
+    pand      %1, m7
+    packssdw  m3, %1 ; pack low-chunk then high-chunk results into words
+%endif
+%if cpuflag(avx2)
+    vpermq     m3, m3, q3120 ; reorder qwords 0,2,1,3 after the per-lane pack
+%endif
+    movu    [%2], m3
+    movu      %1, m2 ; carry this block's low-chunk average into the next (leftward) block
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void frame_subsample_luma( uint8_t *src0, uint8_t *dst0,
+;                              intptr_t src_stride, intptr_t dst_stride,
int width, int height )
+;-----------------------------------------------------------------------------
+
+%macro FRAME_SUBSAMPLE_LUMA 0 ; emits frame_subsample_luma for the current INIT_* target; rows are processed bottom-up, each row right-to-left
+cglobal frame_subsample_luma, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for
HIGH_BIT_DEPTH, 12 otherwise
+%if HIGH_BIT_DEPTH
+    shl   dword r3m, 1 ; dst_stride *= 2
+    FIX_STRIDES r2
+    shl   dword r4m, 1 ; width *= 2
+%endif
+%if mmsize >= 16
+    add   dword r4m, mmsize-1 ; round width up to a multiple of mmsize
+    and   dword r4m, ~(mmsize-1)
+%endif
+    ; src += 2*(height-1)*stride + 2*width
+    mov      r6d, r5m ; r6 = height
+    dec      r6d
+    imul     r6d, r2d
+    add      r6d, r4m
+    lea       r0, [r0+r6*2]
+    ; dst += (height-1)*stride + width
+    mov      r6d, r5m
+    dec      r6d
+    imul     r6d, r3m
+    add      r6d, r4m
+    add       r1, r6
+    ; gap = stride - width
+    mov      r6d, r3m
+    sub      r6d, r4m
+    PUSH      r6
+    %define dst_gap [rsp+gprsize]
+    mov      r6d, r2d
+    sub      r6d, r4m
+    shl      r6d, 1 ; source gap is doubled: two source rows per output row
+    PUSH      r6
+    %define src_gap [rsp]
+%if HIGH_BIT_DEPTH
+%if cpuflag(xop)
+    mova      m6, [deinterleave_shuf32a]
+    mova      m7, [deinterleave_shuf32b]
+%else
+    pcmpeqw   m7, m7 ; m7 = 0x0000ffff per dword
+    psrld     m7, 16
+%endif
+.vloop:
+    mov      r6d, r4m ; r6 = bytes left in this output row
+%ifnidn cpuname, mmx2
+    movu      m0, [r0] ; prime the carry register with the row average just right of the row end
+    movu      m1, [r0+r2]
+    pavgw     m0, m1
+    pavgw     m1, [r0+r2*2]
+%endif
+.hloop:
+    sub       r0, mmsize*2 ; step left: 2*mmsize source bytes per mmsize output bytes
+    sub       r1, mmsize
+%ifidn cpuname, mmx2
+    SUBSAMPLEFILT8xU r1, 0
+%else
+    SUBSAMPLEFILT8xA m0, r1, 0
+%endif
+    sub      r6d, mmsize
+    jg .hloop
+%else ; !HIGH_BIT_DEPTH
+%if cpuflag(avx2)
+    mova      m7, [deinterleave_shuf]
+%elif cpuflag(xop)
+    mova      m6, [deinterleave_shuf32a]
+    mova      m7, [deinterleave_shuf32b]
+%else
+    pcmpeqb   m7, m7 ; m7 = 0x00ff per word
+    psrlw     m7, 8
+%endif
+.vloop:
+    mov      r6d, r4m ; r6 = bytes left in this output row
+%ifnidn cpuname, mmx2
+%if mmsize <= 16
+    mova      m0, [r0] ; prime the carry register with the row average just right of the row end
+    mova      m1, [r0+r2]
+    pavgb     m0, m1
+    pavgb     m1, [r0+r2*2]
+%endif
+%endif
+.hloop:
+    sub       r0, mmsize*2 ; step left: 2*mmsize source bytes per mmsize output bytes
+    sub       r1, mmsize
+%if mmsize==32
+    SUBSAMPLEFILT32x4U r1
+%elifdef m8
+    SUBSAMPLEFILT8x4   m0, m1, m2, m3, m10, m11, mmsize
+    mova      m8, m0 ; keep the high-chunk result while the low chunk is filtered
+    mova      m9, m1
+    SUBSAMPLEFILT8x4   m2, m3, m0, m1, m4, m5, 0
+%if cpuflag(xop)
+    vpperm    m4, m2, m8, m7
+    vpperm    m2, m2, m8, m6
+%else
+    packuswb  m2, m8 ; low-chunk results followed by high-chunk results
+%endif
+    mova    [r1], m2
+%elifidn cpuname, mmx2
+    SUBSAMPLEFILT8x2U  r1, 0
+%else
+    SUBSAMPLEFILT16x2  m0, r1, 0
+%endif
+    sub      r6d, mmsize
+    jg .hloop
+%endif ; HIGH_BIT_DEPTH
+.skip:
+    mov       r3, dst_gap
+    sub       r0, src_gap ; rewind source to the end of the row pair above
+    sub       r1, r3 ; rewind destination to the end of the row above
+    dec    dword r5m ; one output row finished
+    jg .vloop
+    ADD      rsp, 2*gprsize ; drop the two pushed gap values
+    emms
+    RET
+%endmacro ; FRAME_SUBSAMPLE_LUMA
+
+INIT_MMX mmx2
+FRAME_SUBSAMPLE_LUMA
+%if ARCH_X86_64 == 0
+INIT_MMX cache32, mmx2
+FRAME_SUBSAMPLE_LUMA
+%endif
+INIT_XMM sse2
+FRAME_SUBSAMPLE_LUMA
+INIT_XMM ssse3
+FRAME_SUBSAMPLE_LUMA
+INIT_XMM avx
+FRAME_SUBSAMPLE_LUMA
+INIT_XMM xop
+FRAME_SUBSAMPLE_LUMA
+%if ARCH_X86_64 == 1
+INIT_YMM avx2
+FRAME_SUBSAMPLE_LUMA
+%endif
+
 ;-----------------------------------------------------------------------------
 ; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, int32_t
*intra_costs,
 ;                             uint16_t *inter_costs, int32_t *inv_qscales,
double *fps_factor, int len )
diff --git a/source/common/x86/mc.h b/source/common/x86/mc.h
index 83b97a469..8a0564c14 100644
--- a/source/common/x86/mc.h
+++ b/source/common/x86/mc.h
@@ -36,6 +36,17 @@ LOWRES(xop)

 #undef LOWRES

+#define SUBSAMPLELUMA(cpu) \
+    void PFX(frame_subsample_luma_ ## cpu)(const pixel* src0, pixel* dst0,
intptr_t src_stride, intptr_t dst_stride, int width, int height);
+SUBSAMPLELUMA(mmx2)
+SUBSAMPLELUMA(sse2)
+SUBSAMPLELUMA(ssse3)
+SUBSAMPLELUMA(avx)
+SUBSAMPLELUMA(avx2)
+SUBSAMPLELUMA(xop)
+
+#undef SUBSAMPLELUMA
+
 #define PROPAGATE_COST(cpu) \
     void PFX(mbtree_propagate_cost_ ## cpu)(int* dst, const uint16_t*
propagateIn, const int32_t* intraCosts, \
                                               const uint16_t* interCosts,
const int32_t* invQscales, const double* fpsFactor, int len);
diff --git a/source/encoder/encoder.cpp b/source/encoder/encoder.cpp
index 47a9913bc..c24507e5a 100644
--- a/source/encoder/encoder.cpp
+++ b/source/encoder/encoder.cpp
@@ -2703,8 +2703,10 @@ int Encoder::encode(const x265_picture* pic_in,
x265_picture* pic_out)

                 if (!*frameEnc->m_isSubSampled)
                 {
-
curEncoder->m_frameEncTF->subsampleLuma(frameEnc->m_fencPic,
frameEnc->m_fencPicSubsampled2);
-
curEncoder->m_frameEncTF->subsampleLuma(frameEnc->m_fencPicSubsampled2,
frameEnc->m_fencPicSubsampled4);
+                    primitives.frameSubSampleLuma((const pixel
*)frameEnc->m_fencPic->m_picOrg[0],frameEnc->m_fencPicSubsampled2->m_picOrg[0],
frameEnc->m_fencPic->m_stride, frameEnc->m_fencPicSubsampled2->m_stride,
frameEnc->m_fencPicSubsampled2->m_picWidth,
frameEnc->m_fencPicSubsampled2->m_picHeight);
+
extendPicBorder(frameEnc->m_fencPicSubsampled2->m_picOrg[0],
frameEnc->m_fencPicSubsampled2->m_stride,
frameEnc->m_fencPicSubsampled2->m_picWidth,
frameEnc->m_fencPicSubsampled2->m_picHeight,
frameEnc->m_fencPicSubsampled2->m_lumaMarginX,
frameEnc->m_fencPicSubsampled2->m_lumaMarginY);
+                    primitives.frameSubSampleLuma((const pixel
*)frameEnc->m_fencPicSubsampled2->m_picOrg[0],frameEnc->m_fencPicSubsampled4->m_picOrg[0],
frameEnc->m_fencPicSubsampled2->m_stride,
frameEnc->m_fencPicSubsampled4->m_stride,
frameEnc->m_fencPicSubsampled4->m_picWidth,
frameEnc->m_fencPicSubsampled4->m_picHeight);
+
extendPicBorder(frameEnc->m_fencPicSubsampled4->m_picOrg[0],
frameEnc->m_fencPicSubsampled4->m_stride,
frameEnc->m_fencPicSubsampled4->m_picWidth,
frameEnc->m_fencPicSubsampled4->m_picHeight,
frameEnc->m_fencPicSubsampled4->m_lumaMarginX,
frameEnc->m_fencPicSubsampled4->m_lumaMarginY);
                     *frameEnc->m_isSubSampled = true;
                 }

@@ -2713,8 +2715,10 @@ int Encoder::encode(const x265_picture* pic_in,
x265_picture* pic_out)
                     TemporalFilterRefPicInfo *ref =
&curEncoder->m_mcstfRefList[i - 1];
                     if (!*ref->isSubsampled)
                     {
-
curEncoder->m_frameEncTF->subsampleLuma(ref->picBuffer,
ref->picBufferSubSampled2);
-
curEncoder->m_frameEncTF->subsampleLuma(ref->picBufferSubSampled2,
ref->picBufferSubSampled4);
+                        primitives.frameSubSampleLuma((const pixel
*)ref->picBuffer->m_picOrg[0], ref->picBufferSubSampled2->m_picOrg[0],
ref->picBuffer->m_stride, ref->picBufferSubSampled2->m_stride,
ref->picBufferSubSampled2->m_picWidth,
ref->picBufferSubSampled2->m_picHeight);
+
extendPicBorder(ref->picBufferSubSampled2->m_picOrg[0],
ref->picBufferSubSampled2->m_stride, ref->picBufferSubSampled2->m_picWidth,
ref->picBufferSubSampled2->m_picHeight,
ref->picBufferSubSampled2->m_lumaMarginX,
ref->picBufferSubSampled2->m_lumaMarginY); // pad the reference's own half-resolution plane, which the next downscale reads from
+                        primitives.frameSubSampleLuma((const pixel
*)ref->picBufferSubSampled2->m_picOrg[0],ref->picBufferSubSampled4->m_picOrg[0],
ref->picBufferSubSampled2->m_stride, ref->picBufferSubSampled4->m_stride,
ref->picBufferSubSampled4->m_picWidth,
ref->picBufferSubSampled4->m_picHeight);
+
extendPicBorder(ref->picBufferSubSampled4->m_picOrg[0],
ref->picBufferSubSampled4->m_stride, ref->picBufferSubSampled4->m_picWidth,
ref->picBufferSubSampled4->m_picHeight,
ref->picBufferSubSampled4->m_lumaMarginX,
ref->picBufferSubSampled4->m_lumaMarginY);
                         *ref->isSubsampled = true;
                     }
                 }
diff --git a/source/test/pixelharness.cpp b/source/test/pixelharness.cpp
index 6e0af1229..550521666 100644
--- a/source/test/pixelharness.cpp
+++ b/source/test/pixelharness.cpp
@@ -406,6 +406,32 @@ bool PixelHarness::check_downscale_t(downscale_t ref,
downscale_t opt)
     return true;
 }

+bool PixelHarness::check_downscaleluma_t(downscaleluma_t ref,
downscaleluma_t opt)
+{ // runs ref and opt on the same input and requires byte-identical output
+    ALIGN_VAR_16(pixel, ref_destf[32 * 32]);
+    ALIGN_VAR_16(pixel, opt_destf[32 * 32]);
+
+    intptr_t src_stride = 64; // source row pitch in pixels
+    intptr_t dst_stride = 32; // destination row pitch; equals bx, so the 32x32 buffers are fully written
+    int bx = 32; // output width
+    int by = 32; // output height
+    int j = 0;
+    for (int i = 0; i < ITERS; i++)
+    {
+        int index = i % TEST_CASES; // cycle through the prepared test buffers
+        ref(pixel_test_buff[index] + j, ref_destf, src_stride, dst_stride,
bx, by);
+        checked(opt, pixel_test_buff[index] + j, opt_destf, src_stride,
dst_stride, bx, by);
+
+        if (memcmp(ref_destf, opt_destf, 32 * 32 * sizeof(pixel)))
+            return false;
+
+        reportfail();
+        j += INCR; // move the source window for the next iteration
+    }
+
+    return true;
+}
+
 bool PixelHarness::check_cpy2Dto1D_shl_t(cpy2Dto1D_shl_t ref,
cpy2Dto1D_shl_t opt)
 {
     ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
@@ -2793,6 +2819,15 @@ bool PixelHarness::testCorrectness(const
EncoderPrimitives& ref, const EncoderPr
         }
     }

+    if (opt.frameSubSampleLuma)
+    {
+        if (!check_downscaleluma_t(ref.frameSubSampleLuma,
opt.frameSubSampleLuma))
+        {
+            printf("SubSample Luma failed!\n");
+            return false;
+        }
+    }
+
     if (opt.scale1D_128to64[NONALIGNED])
     {
         if (!check_scale1D_pp(ref.scale1D_128to64[NONALIGNED],
opt.scale1D_128to64[NONALIGNED]))
@@ -3492,6 +3527,12 @@ void PixelHarness::measureSpeed(const
EncoderPrimitives& ref, const EncoderPrimi
         REPORT_SPEEDUP(opt.frameInitLowres, ref.frameInitLowres, pbuf2,
pbuf1, pbuf2, pbuf3, pbuf4, 64, 64, 64, 64);
     }

+    if (opt.frameSubSampleLuma)
+    {
+        HEADER0("downscaleluma");
+        REPORT_SPEEDUP(opt.frameSubSampleLuma, ref.frameSubSampleLuma,
pbuf2, pbuf1, 64, 64, 64, 64);
+    }
+
     if (opt.scale1D_128to64[NONALIGNED])
     {
         HEADER0("scale1D_128to64");
diff --git a/source/test/pixelharness.h b/source/test/pixelharness.h
index bf29d36a2..ee43cbeae 100644
--- a/source/test/pixelharness.h
+++ b/source/test/pixelharness.h
@@ -138,6 +138,7 @@ protected:
     bool check_integral_inith(integralh_t ref, integralh_t opt);
     bool check_ssimDist(ssimDistortion_t ref, ssimDistortion_t opt);
     bool check_normFact(normFactor_t ref, normFactor_t opt, int block);
+    bool check_downscaleluma_t(downscaleluma_t ref, downscaleluma_t opt);

 public:

-- 
2.36.0.windows.1

*Thanks and Regards,*





*Snehaa.G, Video Codec Engineer, Media & AI analytics
<https://multicorewareinc.com/>*
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20221019/d88b6648/attachment-0001.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: mcstf_patch_13.diff
Type: application/octet-stream
Size: 24271 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20221019/d88b6648/attachment-0001.obj>