[x265] [PATCH] asm: new primitive planeClipAndMax for Clip Luma to custom range and statistics MaxLumaLevel
Steve Borho
steve at borho.org
Sun Aug 16 11:44:55 CEST 2015
On 08/14, Min Chen wrote:
> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1439590062 25200
> # Node ID a10af57003fc3044d0d4c000290faeda19a81777
> # Parent d56b2466c04459205287e1581d8a36eebf372ba6
> asm: new primitive planeClipAndMax for Clip Luma to custom range and statistics MaxLumaLevel
This adds assembly, but doesn't set up the function pointer for it, nor
is there a testbench.
> ---
> source/common/picyuv.cpp | 16 ++------
> source/common/pixel.cpp | 26 +++++++++++++
> source/common/primitives.h | 2 +
> source/common/x86/const-a.asm | 5 +++
> source/common/x86/pixel-a.asm | 78 +++++++++++++++++++++++++++++++++++++++++
> 5 files changed, 115 insertions(+), 12 deletions(-)
>
> diff -r d56b2466c044 -r a10af57003fc source/common/picyuv.cpp
> --- a/source/common/picyuv.cpp Wed Aug 12 18:12:20 2015 +0530
> +++ b/source/common/picyuv.cpp Fri Aug 14 15:07:42 2015 -0700
> @@ -237,25 +237,17 @@
> pixel *U = m_picOrg[1];
> pixel *V = m_picOrg[2];
>
> - uint64_t sumLuma = 0;
> + uint64_t sumLuma;
> + m_maxLumaLevel = primitives.planeClipAndMax(Y, m_stride, width, height, &sumLuma, (pixel)param.minLuma, (pixel)param.maxLuma);
> + m_avgLumaLevel = (double)(sumLuma) / (m_picHeight * m_picWidth);
> +
> for (int r = 0; r < height; r++)
> {
> - for (int c = 0; c < width; c++)
> - {
> - /* Clip luma of source picture to max and min values before extending edges of picYuv */
> - Y[c] = x265_clip3((pixel)param.minLuma, (pixel)param.maxLuma, Y[c]);
> -
> - /* Determine maximum and average luma level in a picture */
> - m_maxLumaLevel = X265_MAX(Y[c], m_maxLumaLevel);
> - sumLuma += Y[c];
> - }
> -
> for (int x = 0; x < padx; x++)
> Y[width + x] = Y[width - 1];
>
> Y += m_stride;
> }
> - m_avgLumaLevel = (double)(sumLuma) / (m_picHeight * m_picWidth);
>
> for (int r = 0; r < height >> m_vChromaShift; r++)
> {
> diff -r d56b2466c044 -r a10af57003fc source/common/pixel.cpp
> --- a/source/common/pixel.cpp Wed Aug 12 18:12:20 2015 +0530
> +++ b/source/common/pixel.cpp Fri Aug 14 15:07:42 2015 -0700
> @@ -973,6 +973,31 @@
> dst[i] = (int)(propagateAmount * propagateNum / propagateDenom + 0.5);
> }
> }
> +
> +/* Clamps each pixel of the plane to [minPix, maxPix] in place, writes the sum of the
> + * clamped pixels to *outsum and returns the largest clamped pixel (0 for an empty plane) */
> +static pixel planeClipAndMax_c(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix)
> +{
> +    uint64_t total = 0;
> +    pixel peak = 0;
> +    pixel *row = src;
> +
> +    for (int y = 0; y < height; y++, row += stride)
> +    {
> +        for (int x = 0; x < width; x++)
> +        {
> +            /* the clamped value is stored back before it feeds the statistics */
> +            const pixel clipped = x265_clip3(minPix, maxPix, row[x]);
> +            row[x] = clipped;
> +            peak = X265_MAX(clipped, peak);
> +            total += clipped;
> +        }
> +    }
> +
> +    *outsum = total;
> +    return peak;
> +}
> +
> } // end anonymous namespace
>
> namespace X265_NS {
> @@ -1258,6 +1283,7 @@
> p.planecopy_cp = planecopy_cp_c;
> p.planecopy_sp = planecopy_sp_c;
> p.planecopy_sp_shl = planecopy_sp_shl_c;
> + p.planeClipAndMax = planeClipAndMax_c;
> p.propagateCost = estimateCUPropagateCost;
> }
> }
> diff -r d56b2466c044 -r a10af57003fc source/common/primitives.h
> --- a/source/common/primitives.h Wed Aug 12 18:12:20 2015 +0530
> +++ b/source/common/primitives.h Fri Aug 14 15:07:42 2015 -0700
> @@ -185,6 +185,7 @@
> typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
> typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
> typedef void (*planecopy_sp_t) (const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
> +typedef pixel (*planeClipAndMax_t)(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix);
>
> typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
>
> @@ -316,6 +317,7 @@
> planecopy_cp_t planecopy_cp;
> planecopy_sp_t planecopy_sp;
> planecopy_sp_t planecopy_sp_shl;
> + planeClipAndMax_t planeClipAndMax;
>
> weightp_sp_t weight_sp;
> weightp_pp_t weight_pp;
> diff -r d56b2466c044 -r a10af57003fc source/common/x86/const-a.asm
> --- a/source/common/x86/const-a.asm Wed Aug 12 18:12:20 2015 +0530
> +++ b/source/common/x86/const-a.asm Fri Aug 14 15:07:42 2015 -0700
> @@ -54,6 +54,11 @@
> const pb_shuf8x8c, times 1 db 0, 0, 0, 0, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6
> const pb_movemask, times 16 db 0x00
> times 16 db 0xFF
> +
> +const pb_movemask_32, times 32 db 0x00 ; 96-byte table: 32 x 0x00, then 32 x 0xFF, then 32 x 0x00
> + times 32 db 0xFF ; a 32-byte load at offset (32 - n), 0 <= n <= 32, yields n 0x00 bytes followed by 0xFF bytes
> + times 32 db 0x00 ; a 32-byte load at offset (32 + n), 0 <= n <= 32, yields 0xFF bytes followed by n 0x00 bytes
> +
> const pb_0000000000000F0F, times 2 db 0xff, 0x00
> times 12 db 0x00
> const pb_000000000000000F, db 0xff
> diff -r d56b2466c044 -r a10af57003fc source/common/x86/pixel-a.asm
> --- a/source/common/x86/pixel-a.asm Wed Aug 12 18:12:20 2015 +0530
> +++ b/source/common/x86/pixel-a.asm Fri Aug 14 15:07:42 2015 -0700
> @@ -70,6 +70,7 @@
> cextern pd_2
> cextern hmul_16p
> cextern pb_movemask
> +cextern pb_movemask_32
> cextern pw_pixel_max
>
> ;=============================================================================
> @@ -12493,3 +12494,80 @@
> movd eax, xm6
> RET
> %endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 1
> +
> +
> +;-------------------------------------------------------------------------------------------------------------------------------------
> +; pixel planeClipAndMax(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix)
> +;-------------------------------------------------------------------------------------------------------------------------------------
> +%if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
> +; In:  r0 = src, r1 = stride, r2d = width, r3d = height, r4 = outsum, r5m = minPix, r6m = maxPix
> +; Out: eax = largest clipped pixel, [r4] = 64-bit sum of the clipped pixels. Each row ends with a
> +;      full-vector load/store at column (width & -mmsize); bytes past width are rewritten unchanged.
> +INIT_YMM avx2
> +cglobal planeClipAndMax, 5,7,8
> +    movd            xm0, r5m
> +    vpbroadcastb    m0, xm0                 ; m0 = [min]
> +    vpbroadcastb    m1, r6m                 ; m1 = [max]
> +    pxor            m2, m2                  ; m2 = sumLuma (four qword partial sums)
> +    pxor            m3, m3                  ; m3 = maxLumaLevel (per-byte running maximum)
> +    pxor            m4, m4                  ; m4 = zero
> +
> +    ; byte mask for the tail vector: 0x00 = pixel inside the row, 0xFF = byte beyond width
> +    mov             r5d, r2d
> +    and             r2d, ~(mmsize - 1)      ; r2 = width rounded down to a multiple of mmsize
> +    sub             r5d, r2d                ; r5 = width % mmsize
> +    lea             r6, [pb_movemask_32 + mmsize]
> +    sub             r6, r5
> +    movu            m5, [r6]                ; m5 = r5 bytes of 0x00 followed by 0xFF bytes
> +
> +.loopH:
> +    mov             r5d, r2d
> +    sub             r5d, mmsize             ; r5 = offset of the last full vector in this row
> +    jl              .partial                ; width < mmsize: no full vector, never index below column 0
> +.loopW:
> +    movu            m6, [r0 + r5]
> +    pmaxub          m6, m0
> +    pminub          m6, m1
> +    movu            [r0 + r5], m6           ; store back
> +    pmaxub          m3, m6                  ; update maxLumaLevel
> +    psadbw          m6, m4                  ; SAD against zero = byte sums per 8-byte group
> +    paddq           m2, m6
> +    sub             r5d, mmsize
> +    jge             .loopW
> +
> +.partial:                                   ; remaining (width % mmsize) pixels of the row
> +    movu            m7, [r0 + r2]
> +    pmaxub          m6, m7, m0
> +    pminub          m6, m1
> +    pand            m7, m5                  ; keep the original bytes beyond width
> +    pandn           m6, m5, m6              ; zero the clipped bytes beyond width
> +    por             m7, m6                  ; merge clipped pixels with untouched bytes
> +    movu            [r0 + r2], m7           ; store back
> +    pmaxub          m3, m6                  ; update maxLumaLevel (masked bytes are 0)
> +    psadbw          m6, m4
> +    paddq           m2, m6
> +
> +    add             r0, r1
> +    dec             r3d
> +    jg              .loopH
> +
> +    ; sumLuma: fold the four qword partial sums into one
> +    vextracti128    xm0, m2, 1
> +    paddq           xm0, xm2
> +    movhlps         xm1, xm0
> +    paddq           xm0, xm1
> +    movq            [r4], xm0
> +
> +    ; maxLumaLevel: fold 32 byte maxima to 8, then use phminposuw on the inverted words
> +    vextracti128    xm0, m3, 1
> +    pmaxub          xm0, xm3
> +    movhlps         xm3, xm0
> +    pmaxub          xm0, xm3
> +    pmovzxbw        xm0, xm0                ; 8 candidate bytes -> 8 words
> +    pxor            xm0, [pb_movemask + 16] ; xor with 0xFFFF: smallest word now holds ~max
> +    phminposuw      xm0, xm0
> +    movd            eax, xm0
> +    not             al                      ; undo the inversion on the low byte
> +    movzx           eax, al
> +    RET
> +%endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
--
Steve Borho
More information about the x265-devel
mailing list