[x265] [PATCH 13/14] Add x86 ASM implementation for subsampling luma
Snehaa Giridharan
snehaa at multicorewareinc.com
Wed Oct 19 07:32:19 UTC 2022
From 812e7949e643ac7763d80f326c5cb6c0682d2c6a Mon Sep 17 00:00:00 2001
From: Keshav E <keshav at multicorewareinc.com>
Date: Fri, 14 Oct 2022 13:48:36 +0530
Subject: [PATCH] Add x86 ASM implementation for subsampling luma
---
source/common/picyuv.cpp | 4 +-
source/common/pixel.cpp | 19 ++
source/common/primitives.h | 4 +
source/common/temporalfilter.cpp | 29 ---
source/common/temporalfilter.h | 2 -
source/common/x86/asm-primitives.cpp | 14 ++
source/common/x86/mc-a2.asm | 256 +++++++++++++++++++++++++++
source/common/x86/mc.h | 11 ++
source/encoder/encoder.cpp | 12 +-
source/test/pixelharness.cpp | 41 +++++
source/test/pixelharness.h | 1 +
11 files changed, 356 insertions(+), 37 deletions(-)
diff --git a/source/common/picyuv.cpp b/source/common/picyuv.cpp
index 2855356b5..58426a613 100644
--- a/source/common/picyuv.cpp
+++ b/source/common/picyuv.cpp
@@ -162,8 +162,8 @@ bool PicYuv::createScaledPicYUV(x265_param* param,
uint8_t scaleFactor)
uint32_t numCuInWidth = (m_picWidth + param->maxCUSize - 1) /
param->maxCUSize;
uint32_t numCuInHeight = (m_picHeight + param->maxCUSize - 1) /
param->maxCUSize;
- m_lumaMarginX = 32; // search margin for L0 and L1 ME in horizontal
direction
- m_lumaMarginY = 32; // search margin for L0 and L1 ME in vertical
direction
+ m_lumaMarginX = 128; // search margin for L0 and L1 ME in horizontal
direction
+ m_lumaMarginY = 128; // search margin for L0 and L1 ME in vertical
direction
m_stride = (numCuInWidth * param->maxCUSize) + (m_lumaMarginX << 1);
int maxHeight = numCuInHeight * param->maxCUSize;
diff --git a/source/common/pixel.cpp b/source/common/pixel.cpp
index 7a40af668..3cd074cfa 100644
--- a/source/common/pixel.cpp
+++ b/source/common/pixel.cpp
@@ -627,6 +627,23 @@ void frame_init_lowres_core(const pixel* src0, pixel*
dst0, pixel* dsth, pixel*
}
}
+/* Reference C version of the luma subsampler.  Every destination pixel is
+ * derived from a 2x2 group of source pixels: each of the two columns is first
+ * averaged vertically with round-up, then the two column results are averaged
+ * with round-up again.  width and height are the dimensions of the destination
+ * plane; the source is consumed two pixels and two rows per output pixel. */
+static void frame_subsample_luma(const pixel* src0, pixel* dst0, intptr_t src_stride, intptr_t dst_stride, int width, int height)
+{
+    for (int row = 0; row < height; row++)
+    {
+        const pixel* upper = src0 + src_stride * 2 * row;
+        const pixel* lower = upper + src_stride;
+        pixel* out = dst0 + dst_stride * row;
+
+        for (int col = 0; col < width; col++)
+        {
+            const int left  = (upper[2 * col] + lower[2 * col] + 1) >> 1;
+            const int right = (upper[2 * col + 1] + lower[2 * col + 1] + 1) >> 1;
+            out[col] = (left + right + 1) >> 1;
+        }
+    }
+}
+
/* structural similarity metric */
static void ssim_4x4x2_core(const pixel* pix1, intptr_t stride1, const
pixel* pix2, intptr_t stride2, int sums[2][4])
{
@@ -1355,5 +1372,7 @@ void setupPixelPrimitives_c(EncoderPrimitives &p)
p.cu[BLOCK_16x16].normFact = normFact_c;
p.cu[BLOCK_32x32].normFact = normFact_c;
p.cu[BLOCK_64x64].normFact = normFact_c;
+ /* SubSample Luma*/
+ p.frameSubSampleLuma = frame_subsample_luma;
}
}
diff --git a/source/common/primitives.h b/source/common/primitives.h
index 7edfc315f..df1cae4b7 100644
--- a/source/common/primitives.h
+++ b/source/common/primitives.h
@@ -232,6 +232,8 @@ typedef void(*psyRdoQuant_t1)(int16_t *m_resiDctCoeff,
int64_t *costUncoded, int
typedef void(*psyRdoQuant_t2)(int16_t *m_resiDctCoeff, int16_t
*m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t
*totalRdCost, int64_t *psyScale, uint32_t blkPos);
typedef void(*ssimDistortion_t)(const pixel *fenc, uint32_t fStride, const
pixel *recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t
*ac_k);
typedef void(*normFactor_t)(const pixel *src, uint32_t blockSize, int
shift, uint64_t *z_k);
+/* SubSampling Luma */
+typedef void (*downscaleluma_t)(const pixel* src0, pixel* dstf, intptr_t
src_stride, intptr_t dst_stride, int width, int height);
/* Function pointers to optimized encoder primitives. Each pointer can
reference
* either an assembly routine, a SIMD intrinsic primitive, or a C function
*/
struct EncoderPrimitives
@@ -353,6 +355,8 @@ struct EncoderPrimitives
downscale_t frameInitLowres;
downscale_t frameInitLowerRes;
+ /* Sub Sample Luma */
+ downscaleluma_t frameSubSampleLuma;
cutree_propagate_cost propagateCost;
cutree_fix8_unpack fix8Unpack;
cutree_fix8_pack fix8Pack;
diff --git a/source/common/temporalfilter.cpp
b/source/common/temporalfilter.cpp
index 72edaaac1..d178c5800 100644
--- a/source/common/temporalfilter.cpp
+++ b/source/common/temporalfilter.cpp
@@ -1213,35 +1213,6 @@ void
TemporalFilter::motionEstimationLumaDoubleRes(MV *mvs, uint32_t mvStride, P
}
#endif
-void TemporalFilter::subsampleLuma(PicYuv *input, PicYuv *output, int
factor)
-{
-
- int newWidth = output->m_picWidth;
- int newHeight = output->m_picHeight;
-
- pixel* srcRow = input->m_picOrg[0];
- intptr_t srcStride = input->m_stride;
-
- pixel *dstRow = output->m_picOrg[0];
- intptr_t dstStride = output->m_stride;
-
- for (int y = 0; y < newHeight; y++, srcRow += factor * srcStride,
dstRow += dstStride)
- {
- pixel *inRow = srcRow;
- pixel *inRowBelow = srcRow + srcStride;
- pixel *target = dstRow;
-
- for (int x = 0; x < newWidth; x++)
- {
- target[x] = (inRow[0] + inRowBelow[0] + inRow[1] +
inRowBelow[1] + 2) >> 2;
- inRow += 2;
- inRowBelow += 2;
- }
- }
-
- extendPicBorder(output->m_picOrg[0], output->m_stride,
output->m_picWidth, output->m_picHeight, output->m_lumaMarginX,
output->m_lumaMarginY);
-}
-
void TemporalFilter::destroyRefPicInfo(TemporalFilterRefPicInfo* curFrame)
{
if (curFrame)
diff --git a/source/common/temporalfilter.h b/source/common/temporalfilter.h
index f49fae3c5..55ae6736d 100644
--- a/source/common/temporalfilter.h
+++ b/source/common/temporalfilter.h
@@ -162,8 +162,6 @@ public:
Yuv predPUYuv;
int m_useSADinME;
- void subsampleLuma(PicYuv *input, PicYuv *output, int factor = 2);
-
int createRefPicInfo(TemporalFilterRefPicInfo* refFrame, x265_param*
param);
void bilateralFilter(Frame* frame, TemporalFilterRefPicInfo*
mctfRefList, double overallStrength);
diff --git a/source/common/x86/asm-primitives.cpp
b/source/common/x86/asm-primitives.cpp
index 5cad50f07..bb32f32cd 100644
--- a/source/common/x86/asm-primitives.cpp
+++ b/source/common/x86/asm-primitives.cpp
@@ -1091,6 +1091,7 @@ void setupAssemblyPrimitives(EncoderPrimitives &p,
int cpuMask) // Main10
p.frameInitLowres = PFX(frame_init_lowres_core_sse2);
p.frameInitLowerRes = PFX(frame_init_lowres_core_sse2);
+ p.frameSubSampleLuma = PFX(frame_subsample_luma_sse2);
// TODO: the planecopy_sp is really planecopy_SC now, must be fix
it
//p.planecopy_sp = PFX(downShift_16_sse2);
p.planecopy_sp_shl = PFX(upShift_16_sse2);
@@ -1121,6 +1122,7 @@ void setupAssemblyPrimitives(EncoderPrimitives &p,
int cpuMask) // Main10
{
ASSIGN2(p.scale1D_128to64, scale1D_128to64_ssse3);
p.scale2D_64to32 = PFX(scale2D_64to32_ssse3);
+ p.frameSubSampleLuma = PFX(frame_subsample_luma_ssse3);
// p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d =
PFX(pixel_satd_4x4_ssse3); this one is broken
ALL_LUMA_PU(satd, pixel_satd, ssse3);
@@ -1462,6 +1464,7 @@ void setupAssemblyPrimitives(EncoderPrimitives &p,
int cpuMask) // Main10
p.pu[LUMA_64x48].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x48_avx);
p.pu[LUMA_64x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx);
p.propagateCost = PFX(mbtree_propagate_cost_avx);
+ p.frameSubSampleLuma = PFX(frame_subsample_luma_avx);
}
if (cpuMask & X265_CPU_XOP)
{
@@ -1473,6 +1476,7 @@ void setupAssemblyPrimitives(EncoderPrimitives &p,
int cpuMask) // Main10
LUMA_VAR(xop);
p.frameInitLowres = PFX(frame_init_lowres_core_xop);
p.frameInitLowerRes = PFX(frame_init_lowres_core_xop);
+ p.frameSubSampleLuma = PFX(frame_subsample_luma_xop);
}
if (cpuMask & X265_CPU_AVX2)
{
@@ -2301,6 +2305,9 @@ void setupAssemblyPrimitives(EncoderPrimitives &p,
int cpuMask) // Main10
p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
p.frameInitLowerRes = PFX(frame_init_lowres_core_avx2);
+
+ p.frameSubSampleLuma = PFX(frame_subsample_luma_avx2);
+
p.propagateCost = PFX(mbtree_propagate_cost_avx2);
p.fix8Unpack = PFX(cutree_fix8_unpack_avx2);
p.fix8Pack = PFX(cutree_fix8_pack_avx2);
@@ -3300,6 +3307,7 @@ void setupAssemblyPrimitives(EncoderPrimitives &p,
int cpuMask) // Main
//p.frameInitLowres = PFX(frame_init_lowres_core_mmx2);
p.frameInitLowres = PFX(frame_init_lowres_core_sse2);
p.frameInitLowerRes = PFX(frame_init_lowres_core_sse2);
+ p.frameSubSampleLuma = PFX(frame_subsample_luma_sse2);
ALL_LUMA_TU(blockfill_s[NONALIGNED], blockfill_s, sse2);
ALL_LUMA_TU(blockfill_s[ALIGNED], blockfill_s, sse2);
@@ -3424,6 +3432,8 @@ void setupAssemblyPrimitives(EncoderPrimitives &p,
int cpuMask) // Main
ASSIGN2(p.scale1D_128to64, scale1D_128to64_ssse3);
p.scale2D_64to32 = PFX(scale2D_64to32_ssse3);
+ p.frameSubSampleLuma = PFX(frame_subsample_luma_ssse3);
+
ASSIGN2(p.pu[LUMA_8x4].convert_p2s, filterPixelToShort_8x4_ssse3);
ASSIGN2(p.pu[LUMA_8x8].convert_p2s, filterPixelToShort_8x8_ssse3);
ASSIGN2(p.pu[LUMA_8x16].convert_p2s,
filterPixelToShort_8x16_ssse3);
@@ -3691,6 +3701,7 @@ void setupAssemblyPrimitives(EncoderPrimitives &p,
int cpuMask) // Main
p.frameInitLowres = PFX(frame_init_lowres_core_avx);
p.frameInitLowerRes = PFX(frame_init_lowres_core_avx);
p.propagateCost = PFX(mbtree_propagate_cost_avx);
+ p.frameSubSampleLuma = PFX(frame_subsample_luma_avx);
}
if (cpuMask & X265_CPU_XOP)
{
@@ -3702,6 +3713,7 @@ void setupAssemblyPrimitives(EncoderPrimitives &p,
int cpuMask) // Main
p.cu[BLOCK_16x16].sse_pp = PFX(pixel_ssd_16x16_xop);
p.frameInitLowres = PFX(frame_init_lowres_core_xop);
p.frameInitLowerRes = PFX(frame_init_lowres_core_xop);
+ p.frameSubSampleLuma = PFX(frame_subsample_luma_xop);
}
#if X86_64
@@ -4684,6 +4696,8 @@ void setupAssemblyPrimitives(EncoderPrimitives &p,
int cpuMask) // Main
p.saoCuStatsE2 = PFX(saoCuStatsE2_avx2);
p.saoCuStatsE3 = PFX(saoCuStatsE3_avx2);
+ p.frameSubSampleLuma = PFX(frame_subsample_luma_avx2);
+
if (cpuMask & X265_CPU_BMI2)
{
p.scanPosLast = PFX(scanPosLast_avx2_bmi2);
diff --git a/source/common/x86/mc-a2.asm b/source/common/x86/mc-a2.asm
index fc36b2cc8..94a995ee0 100644
--- a/source/common/x86/mc-a2.asm
+++ b/source/common/x86/mc-a2.asm
@@ -992,6 +992,262 @@ INIT_YMM avx2
FRAME_INIT_LOWRES
%endif
+; SUBSAMPLEFILT8x4 io0, io1, v0, v1, odd0, odd1, offset
+; 8-bit helper working on one vector at byte offset %7 from r0 (r2 = source stride).
+;   %3 (v0) = rounded byte average of the rows at [r0+%7] and [r0+r2+%7]
+;   %4 (v1) = rounded byte average of the rows at [r0+r2+%7] and [r0+r2*2+%7]
+;   %1/%2   = in/out: combined with %3/%4 through PALIGNR by one byte (m6 is the
+;             scratch register handed to PALIGNR), then averaged with %3/%4, i.e.
+;             each byte is averaged with the byte that follows it
+;             (assuming x86inc's PALIGNR dst,src,shift,tmp semantics - TODO confirm).
+; Without XOP the result is also split per 16-bit word: %5/%6 get the high bytes
+; (psrlw 8) and %1/%2 are masked with m7, which the caller sets up.
+; NOTE(review): in FRAME_SUBSAMPLE_LUMA only %1 appears to be consumed; the %2/%4
+; row pair and the %5/%6 outputs look unused there - confirm before trimming.
+%macro SUBSAMPLEFILT8x4 7
+ mova %3, [r0+%7]
+ mova %4, [r0+r2+%7]
+ pavgb %3, %4
+ pavgb %4, [r0+r2*2+%7]
+ PALIGNR %1, %3, 1, m6
+ PALIGNR %2, %4, 1, m6
+%if cpuflag(xop)
+ pavgb %1, %3
+ pavgb %2, %4
+%else
+ pavgb %1, %3
+ pavgb %2, %4
+ psrlw %5, %1, 8
+ psrlw %6, %2, 8
+ pand %1, m7
+ pand %2, m7
+%endif
+%endmacro
+
+; SUBSAMPLEFILT32x4U dst
+; 8-bit, mmsize==32 helper: reads 2*mmsize source bytes (plus one byte of
+; look-ahead) from the rows at [r0] and [r0+r2] and stores mmsize output bytes
+; to [%1].  Each vector is averaged vertically (row with row below) for the
+; byte-aligned and the +1 shifted load, and the two results are then averaged
+; so that every byte holds the rounded mean of a horizontal pair.
+; m7 must hold the shuffle mask loaded by the caller (deinterleave_shuf), which
+; presumably gathers the even-indexed bytes of each lane into its low half.
+; Only one output row is produced per call, so no third source row is read.
+%macro SUBSAMPLEFILT32x4U 1
+ movu m1, [r0+r2]
+ pavgb m0, m1, [r0]
+ movu m3, [r0+r2+1]
+ pavgb m2, m3, [r0+1]
+ pavgb m0, m2
+
+ movu m3, [r0+r2+mmsize]
+ pavgb m2, m3, [r0+mmsize]
+ movu m5, [r0+r2+1+mmsize]
+ pavgb m4, m5, [r0+1+mmsize]
+ pavgb m2, m4
+
+ pshufb m0, m7
+ pshufb m2, m7
+ punpcklqdq m0, m0, m2 ; low qword of each lane: first vector, then second
+ vpermq m0, m0, q3120 ; restore linear order across the two 128-bit lanes
+ movu [%1], m0
+%endmacro
+
+; SUBSAMPLEFILT16x2 carry, dst, offset
+; 8-bit helper: consumes two vectors at [r0+%3] and [r0+%3+mmsize] from the row
+; at r0 and the row at r0+r2, and stores one vector of output to [%2].
+;   m2/m3 = rounded vertical averages of the low / high source vector
+;   %1    = in/out carry: on entry it supplies the bytes spliced in after m3 by
+;           PALIGNR; on exit it holds m2 (the vertical average of the low
+;           vector) for the next call
+;           (assuming x86inc's PALIGNR dst,src,shift,tmp semantics - TODO confirm).
+; After the horizontal average, the XOP build selects bytes with vpperm using
+; the m6 mask; other builds mask with m7 and pack the words back to bytes.
+%macro SUBSAMPLEFILT16x2 3
+ mova m3, [r0+%3+mmsize]
+ mova m2, [r0+%3]
+ pavgb m3, [r0+%3+r2+mmsize]
+ pavgb m2, [r0+%3+r2]
+ PALIGNR %1, m3, 1, m6
+ pavgb %1, m3
+ PALIGNR m3, m2, 1, m6
+ pavgb m3, m2
+%if cpuflag(xop)
+ vpperm m3, m3, %1, m6
+%else
+ pand m3, m7
+ pand %1, m7
+ packuswb m3, %1
+%endif
+ mova [%2], m3
+ mova %1, m2
+%endmacro
+
+; SUBSAMPLEFILT8x2U dst, offset
+; 8-bit MMX helper: turns 16 source bytes starting at [r0+%2] (rows r0 and
+; r0+r2) into 8 output bytes stored at [%1].
+;   m2/m3 = rounded vertical averages of source bytes 0..7 / 8..15
+;   m0/m1 = the same for the loads shifted by one byte (1..8 / 9..16)
+; Averaging m0 with m2 and m1 with m3 leaves the rounded mean of each
+; horizontal pair in the low byte of every word; m7 (word mask prepared by the
+; caller) keeps those bytes and packuswb packs both halves into one register.
+; Both halves must be loaded here: m1/m3 are not carried in from the caller.
+%macro SUBSAMPLEFILT8x2U 2
+ mova m3, [r0+%2+8]
+ mova m2, [r0+%2]
+ pavgb m3, [r0+%2+r2+8]
+ pavgb m2, [r0+%2+r2]
+ mova m1, [r0+%2+9]
+ mova m0, [r0+%2+1]
+ pavgb m1, [r0+%2+r2+9]
+ pavgb m0, [r0+%2+r2+1]
+ pavgb m1, m3
+ pavgb m0, m2
+ pand m1, m7
+ pand m0, m7
+ packuswb m0, m1
+ mova [%1], m0
+%endmacro
+
+; SUBSAMPLEFILT8xU dst, offset
+; 16-bit-sample MMX helper: turns 16 source bytes starting at [r0+%2] (rows r0
+; and r0+r2) into 8 output bytes stored at [%1].
+;   m2/m3 = rounded vertical word averages of source bytes 0..7 / 8..15
+;   m0/m1 = the same for the loads shifted by one sample (2 bytes)
+; Averaging m0 with m2 and m1 with m3 leaves the rounded mean of each
+; horizontal sample pair in the low word of every dword; m7 (dword mask
+; prepared by the caller) keeps those words and packssdw packs both halves.
+%macro SUBSAMPLEFILT8xU 2
+ mova m3, [r0+%2+8]
+ mova m2, [r0+%2]
+ pavgw m3, [r0+%2+r2+8]
+ pavgw m2, [r0+%2+r2]
+ movu m1, [r0+%2+10]
+ movu m0, [r0+%2+2]
+ pavgw m1, [r0+%2+r2+10]
+ pavgw m0, [r0+%2+r2+2]
+ pavgw m1, m3
+ pavgw m0, m2
+ pand m1, m7
+ pand m0, m7
+ packssdw m0, m1
+ movu [%1], m0
+%endmacro
+
+; SUBSAMPLEFILT8xA carry, dst, offset
+; 16-bit-sample helper: consumes two vectors at [r0+%3] and [r0+%3+mmsize]
+; from the row at r0 and the row at r0+r2, and stores one vector to [%2].
+;   m2/m3 = rounded vertical word averages of the low / high source vector
+;   %1    = in/out carry: on entry it supplies the sample spliced in after m3
+;           by PALIGNR (2-byte shift); on exit it holds m2 for the next call
+;           (assuming x86inc's PALIGNR dst,src,shift,tmp semantics - TODO confirm).
+; After the horizontal average, the XOP build selects words with vpperm using
+; the m6 mask; other builds mask with m7 and pack dwords back to words.  The
+; AVX2 build additionally reorders the qwords with vpermq before storing.
+%macro SUBSAMPLEFILT8xA 3
+ movu m3, [r0+%3+mmsize]
+ movu m2, [r0+%3]
+ pavgw m3, [r0+%3+r2+mmsize]
+ pavgw m2, [r0+%3+r2]
+ PALIGNR %1, m3, 2, m6
+ pavgw %1, m3
+ PALIGNR m3, m2, 2, m6
+ pavgw m3, m2
+%if cpuflag(xop)
+ vpperm m3, m3, %1, m6
+%else
+ pand m3, m7
+ pand %1, m7
+ packssdw m3, %1
+%endif
+%if cpuflag(avx2)
+ vpermq m3, m3, q3120
+%endif
+ movu [%2], m3
+ movu %1, m2
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void frame_subsample_luma( uint8_t *src0, uint8_t *dst0,
+; intptr_t src_stride, intptr_t dst_stride,
int width, int height )
+;-----------------------------------------------------------------------------
+
+; FRAME_SUBSAMPLE_LUMA: emits frame_subsample_luma for the instruction set
+; selected by the preceding INIT_* line.
+; Arguments: r0 = src0, r1 = dst0, r2 = src_stride, r3 = dst_stride,
+;            r4 = width, r5 = height (of the destination plane).
+; The plane is walked from the last row to the first and, within a row, from
+; the right edge to the left: both pointers are first moved to the end of the
+; last row and then decremented.
+; NOTE(review): for mmsize >= 16 the width is rounded up to a multiple of
+; mmsize, so up to mmsize-1 extra bytes are written per row and the matching
+; source bytes are read; the 8-bit SSE paths also use aligned loads/stores
+; (mova) - presumably callers guarantee padding and alignment; confirm.
+%macro FRAME_SUBSAMPLE_LUMA 0
+cglobal frame_subsample_luma, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
+%if HIGH_BIT_DEPTH
+ ; 16-bit samples: double dst stride and width so they count bytes
+ ; (FIX_STRIDES presumably does the same for the source stride in r2)
+ shl dword r3m, 1
+ FIX_STRIDES r2
+ shl dword r4m, 1
+%endif
+%if mmsize >= 16
+ ; round the width up to a whole number of vectors
+ add dword r4m, mmsize-1
+ and dword r4m, ~(mmsize-1)
+%endif
+ ; src += 2*(height-1)*stride + 2*width
+ mov r6d, r5m
+ dec r6d
+ imul r6d, r2d
+ add r6d, r4m
+ lea r0, [r0+r6*2]
+ ; dst += (height-1)*stride + width
+ mov r6d, r5m
+ dec r6d
+ imul r6d, r3m
+ add r6d, r4m
+ add r1, r6
+ ; gap = stride - width
+ ; both gaps are kept on the stack; they are subtracted after every row
+ mov r6d, r3m
+ sub r6d, r4m
+ PUSH r6
+ %define dst_gap [rsp+gprsize]
+ mov r6d, r2d
+ sub r6d, r4m
+ shl r6d, 1 ; two source rows are consumed per output row
+ PUSH r6
+ %define src_gap [rsp]
+%if HIGH_BIT_DEPTH
+%if cpuflag(xop)
+ mova m6, [deinterleave_shuf32a]
+ mova m7, [deinterleave_shuf32b]
+%else
+ ; m7 = 0x0000FFFF in every dword (keeps the low word)
+ pcmpeqw m7, m7
+ psrld m7, 16
+%endif
+.vloop:
+ mov r6d, r4m ; r6d = bytes left in this output row
+%ifnidn cpuname, mmx2
+ ; prime the carry register with the vertical average just past the row end
+ movu m0, [r0]
+ movu m1, [r0+r2]
+ pavgw m0, m1
+ pavgw m1, [r0+r2*2]
+%endif
+.hloop:
+ ; step left: 2*mmsize source bytes produce mmsize destination bytes
+ sub r0, mmsize*2
+ sub r1, mmsize
+%ifidn cpuname, mmx2
+ SUBSAMPLEFILT8xU r1, 0
+%else
+ SUBSAMPLEFILT8xA m0, r1, 0
+%endif
+ sub r6d, mmsize
+ jg .hloop
+%else ; !HIGH_BIT_DEPTH
+%if cpuflag(avx2)
+ mova m7, [deinterleave_shuf]
+%elif cpuflag(xop)
+ mova m6, [deinterleave_shuf32a]
+ mova m7, [deinterleave_shuf32b]
+%else
+ ; m7 = 0x00FF in every word (keeps the low byte)
+ pcmpeqb m7, m7
+ psrlw m7, 8
+%endif
+.vloop:
+ mov r6d, r4m ; r6d = bytes left in this output row
+%ifnidn cpuname, mmx2
+%if mmsize <= 16
+ ; prime the carry register with the vertical average just past the row end
+ mova m0, [r0]
+ mova m1, [r0+r2]
+ pavgb m0, m1
+ pavgb m1, [r0+r2*2]
+%endif
+%endif
+.hloop:
+ ; step left: 2*mmsize source bytes produce mmsize destination bytes
+ sub r0, mmsize*2
+ sub r1, mmsize
+%if mmsize==32
+ SUBSAMPLEFILT32x4U r1
+%elifdef m8
+ ; taken when register m8 is defined (presumably 64-bit XMM builds)
+ SUBSAMPLEFILT8x4 m0, m1, m2, m3, m10, m11, mmsize
+ mova m8, m0
+ mova m9, m1
+ SUBSAMPLEFILT8x4 m2, m3, m0, m1, m4, m5, 0
+%if cpuflag(xop)
+ vpperm m4, m2, m8, m7
+ vpperm m2, m2, m8, m6
+%else
+ packuswb m2, m8
+%endif
+ mova [r1], m2
+%elifidn cpuname, mmx2
+ SUBSAMPLEFILT8x2U r1, 0
+%else
+ SUBSAMPLEFILT16x2 m0, r1, 0
+%endif
+ sub r6d, mmsize
+ jg .hloop
+%endif ; HIGH_BIT_DEPTH
+.skip:
+ ; move both pointers to the end of the previous row
+ mov r3, dst_gap
+ sub r0, src_gap
+ sub r1, r3
+ dec dword r5m
+ jg .vloop
+ ADD rsp, 2*gprsize ; drop the two gap slots pushed above
+ emms
+ RET
+%endmacro ; FRAME_SUBSAMPLE_LUMA
+
+; Instantiate frame_subsample_luma once per instruction set.  The cache32 MMX
+; variant is only assembled for 32-bit builds and the AVX2 variant only for
+; x86-64 builds.
+INIT_MMX mmx2
+FRAME_SUBSAMPLE_LUMA
+%if ARCH_X86_64 == 0
+INIT_MMX cache32, mmx2
+FRAME_SUBSAMPLE_LUMA
+%endif
+INIT_XMM sse2
+FRAME_SUBSAMPLE_LUMA
+INIT_XMM ssse3
+FRAME_SUBSAMPLE_LUMA
+INIT_XMM avx
+FRAME_SUBSAMPLE_LUMA
+INIT_XMM xop
+FRAME_SUBSAMPLE_LUMA
+%if ARCH_X86_64 == 1
+INIT_YMM avx2
+FRAME_SUBSAMPLE_LUMA
+%endif
+
;-----------------------------------------------------------------------------
; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, int32_t
*intra_costs,
; uint16_t *inter_costs, int32_t *inv_qscales,
double *fps_factor, int len )
diff --git a/source/common/x86/mc.h b/source/common/x86/mc.h
index 83b97a469..8a0564c14 100644
--- a/source/common/x86/mc.h
+++ b/source/common/x86/mc.h
@@ -36,6 +36,17 @@ LOWRES(xop)
#undef LOWRES
+#define SUBSAMPLELUMA(cpu) \
+ void PFX(frame_subsample_luma_ ## cpu)(const pixel* src0, pixel* dst0,
intptr_t src_stride, intptr_t dst_stride, int width, int height);
+SUBSAMPLELUMA(mmx2)
+SUBSAMPLELUMA(sse2)
+SUBSAMPLELUMA(ssse3)
+SUBSAMPLELUMA(avx)
+SUBSAMPLELUMA(avx2)
+SUBSAMPLELUMA(xop)
+
+#undef SUBSAMPLELUMA
+
#define PROPAGATE_COST(cpu) \
void PFX(mbtree_propagate_cost_ ## cpu)(int* dst, const uint16_t*
propagateIn, const int32_t* intraCosts, \
const uint16_t* interCosts,
const int32_t* invQscales, const double* fpsFactor, int len);
diff --git a/source/encoder/encoder.cpp b/source/encoder/encoder.cpp
index 47a9913bc..c24507e5a 100644
--- a/source/encoder/encoder.cpp
+++ b/source/encoder/encoder.cpp
@@ -2703,8 +2703,10 @@ int Encoder::encode(const x265_picture* pic_in,
x265_picture* pic_out)
if (!*frameEnc->m_isSubSampled)
{
-
curEncoder->m_frameEncTF->subsampleLuma(frameEnc->m_fencPic,
frameEnc->m_fencPicSubsampled2);
-
curEncoder->m_frameEncTF->subsampleLuma(frameEnc->m_fencPicSubsampled2,
frameEnc->m_fencPicSubsampled4);
+ primitives.frameSubSampleLuma((const pixel
*)frameEnc->m_fencPic->m_picOrg[0],frameEnc->m_fencPicSubsampled2->m_picOrg[0],
frameEnc->m_fencPic->m_stride, frameEnc->m_fencPicSubsampled2->m_stride,
frameEnc->m_fencPicSubsampled2->m_picWidth,
frameEnc->m_fencPicSubsampled2->m_picHeight);
+
extendPicBorder(frameEnc->m_fencPicSubsampled2->m_picOrg[0],
frameEnc->m_fencPicSubsampled2->m_stride,
frameEnc->m_fencPicSubsampled2->m_picWidth,
frameEnc->m_fencPicSubsampled2->m_picHeight,
frameEnc->m_fencPicSubsampled2->m_lumaMarginX,
frameEnc->m_fencPicSubsampled2->m_lumaMarginY);
+ primitives.frameSubSampleLuma((const pixel
*)frameEnc->m_fencPicSubsampled2->m_picOrg[0],frameEnc->m_fencPicSubsampled4->m_picOrg[0],
frameEnc->m_fencPicSubsampled2->m_stride,
frameEnc->m_fencPicSubsampled4->m_stride,
frameEnc->m_fencPicSubsampled4->m_picWidth,
frameEnc->m_fencPicSubsampled4->m_picHeight);
+
extendPicBorder(frameEnc->m_fencPicSubsampled4->m_picOrg[0],
frameEnc->m_fencPicSubsampled4->m_stride,
frameEnc->m_fencPicSubsampled4->m_picWidth,
frameEnc->m_fencPicSubsampled4->m_picHeight,
frameEnc->m_fencPicSubsampled4->m_lumaMarginX,
frameEnc->m_fencPicSubsampled4->m_lumaMarginY);
*frameEnc->m_isSubSampled = true;
}
@@ -2713,8 +2715,10 @@ int Encoder::encode(const x265_picture* pic_in,
x265_picture* pic_out)
TemporalFilterRefPicInfo *ref =
&curEncoder->m_mcstfRefList[i - 1];
if (!*ref->isSubsampled)
{
-
curEncoder->m_frameEncTF->subsampleLuma(ref->picBuffer,
ref->picBufferSubSampled2);
-
curEncoder->m_frameEncTF->subsampleLuma(ref->picBufferSubSampled2,
ref->picBufferSubSampled4);
+ // Build the half-resolution plane of this reference picture, then pad the
+ // borders of that same plane (ref->picBufferSubSampled2, not the current
+ // frame's m_fencPicSubsampled2).
+ primitives.frameSubSampleLuma((const pixel
*)ref->picBuffer->m_picOrg[0], ref->picBufferSubSampled2->m_picOrg[0],
ref->picBuffer->m_stride, ref->picBufferSubSampled2->m_stride,
ref->picBufferSubSampled2->m_picWidth,
ref->picBufferSubSampled2->m_picHeight);
+
extendPicBorder(ref->picBufferSubSampled2->m_picOrg[0],
ref->picBufferSubSampled2->m_stride, ref->picBufferSubSampled2->m_picWidth,
ref->picBufferSubSampled2->m_picHeight,
ref->picBufferSubSampled2->m_lumaMarginX,
ref->picBufferSubSampled2->m_lumaMarginY);
+ // Quarter-resolution plane: subsample the half-resolution plane once more
+ // and pad its borders.
+ primitives.frameSubSampleLuma((const pixel
*)ref->picBufferSubSampled2->m_picOrg[0],ref->picBufferSubSampled4->m_picOrg[0],
ref->picBufferSubSampled2->m_stride, ref->picBufferSubSampled4->m_stride,
ref->picBufferSubSampled4->m_picWidth,
ref->picBufferSubSampled4->m_picHeight);
+
extendPicBorder(ref->picBufferSubSampled4->m_picOrg[0],
ref->picBufferSubSampled4->m_stride, ref->picBufferSubSampled4->m_picWidth,
ref->picBufferSubSampled4->m_picHeight,
ref->picBufferSubSampled4->m_lumaMarginX,
ref->picBufferSubSampled4->m_lumaMarginY);
*ref->isSubsampled = true;
}
}
diff --git a/source/test/pixelharness.cpp b/source/test/pixelharness.cpp
index 6e0af1229..550521666 100644
--- a/source/test/pixelharness.cpp
+++ b/source/test/pixelharness.cpp
@@ -406,6 +406,32 @@ bool PixelHarness::check_downscale_t(downscale_t ref,
downscale_t opt)
return true;
}
+/* Runs the reference and the optimized luma subsampler on the same source
+ * window (32x32 output taken from a 64-stride source) for ITERS iterations,
+ * moving the source offset by INCR each time, and fails on the first
+ * iteration whose two output buffers differ. */
+bool PixelHarness::check_downscaleluma_t(downscaleluma_t ref, downscaleluma_t opt)
+{
+    const int outWidth = 32;
+    const int outHeight = 32;
+    const intptr_t srcStride = 64;
+    const intptr_t dstStride = 32;
+
+    ALIGN_VAR_16(pixel, refOut[32 * 32]);
+    ALIGN_VAR_16(pixel, optOut[32 * 32]);
+
+    for (int iter = 0, offset = 0; iter < ITERS; iter++, offset += INCR)
+    {
+        const int bufIdx = iter % TEST_CASES;
+
+        ref(pixel_test_buff[bufIdx] + offset, refOut, srcStride, dstStride, outWidth, outHeight);
+        checked(opt, pixel_test_buff[bufIdx] + offset, optOut, srcStride, dstStride, outWidth, outHeight);
+
+        const bool mismatch = memcmp(refOut, optOut, 32 * 32 * sizeof(pixel)) != 0;
+        if (mismatch)
+            return false;
+
+        reportfail();
+    }
+
+    return true;
+}
+
bool PixelHarness::check_cpy2Dto1D_shl_t(cpy2Dto1D_shl_t ref,
cpy2Dto1D_shl_t opt)
{
ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
@@ -2793,6 +2819,15 @@ bool PixelHarness::testCorrectness(const
EncoderPrimitives& ref, const EncoderPr
}
}
+ if (opt.frameSubSampleLuma)
+ {
+ if (!check_downscaleluma_t(ref.frameSubSampleLuma,
opt.frameSubSampleLuma))
+ {
+ printf("SubSample Luma failed!\n");
+ return false;
+ }
+ }
+
if (opt.scale1D_128to64[NONALIGNED])
{
if (!check_scale1D_pp(ref.scale1D_128to64[NONALIGNED],
opt.scale1D_128to64[NONALIGNED]))
@@ -3492,6 +3527,12 @@ void PixelHarness::measureSpeed(const
EncoderPrimitives& ref, const EncoderPrimi
REPORT_SPEEDUP(opt.frameInitLowres, ref.frameInitLowres, pbuf2,
pbuf1, pbuf2, pbuf3, pbuf4, 64, 64, 64, 64);
}
+ if (opt.frameSubSampleLuma)
+ {
+ HEADER0("downscaleluma");
+ REPORT_SPEEDUP(opt.frameSubSampleLuma, ref.frameSubSampleLuma,
pbuf2, pbuf1, 64, 64, 64, 64);
+ }
+
if (opt.scale1D_128to64[NONALIGNED])
{
HEADER0("scale1D_128to64");
diff --git a/source/test/pixelharness.h b/source/test/pixelharness.h
index bf29d36a2..ee43cbeae 100644
--- a/source/test/pixelharness.h
+++ b/source/test/pixelharness.h
@@ -138,6 +138,7 @@ protected:
bool check_integral_inith(integralh_t ref, integralh_t opt);
bool check_ssimDist(ssimDistortion_t ref, ssimDistortion_t opt);
bool check_normFact(normFactor_t ref, normFactor_t opt, int block);
+ bool check_downscaleluma_t(downscaleluma_t ref, downscaleluma_t opt);
public:
--
2.36.0.windows.1
*Thanks and Regards,*
*Snehaa.G*
*Video Codec Engineer, Media & AI analytics
<https://multicorewareinc.com/>*
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20221019/d88b6648/attachment-0001.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: mcstf_patch_13.diff
Type: application/octet-stream
Size: 24271 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20221019/d88b6648/attachment-0001.obj>
More information about the x265-devel
mailing list