[x265] [PATCH] asm: new primitive planeClipAndMax to clip luma to a custom range and collect MaxLumaLevel statistics
Min Chen
chenm003 at 163.com
Sat Aug 15 00:07:49 CEST 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1439590062 25200
# Node ID a10af57003fc3044d0d4c000290faeda19a81777
# Parent d56b2466c04459205287e1581d8a36eebf372ba6
asm: new primitive planeClipAndMax to clip luma to a custom range and collect MaxLumaLevel statistics
---
source/common/picyuv.cpp | 16 ++------
source/common/pixel.cpp | 26 +++++++++++++
source/common/primitives.h | 2 +
source/common/x86/const-a.asm | 5 +++
source/common/x86/pixel-a.asm | 78 +++++++++++++++++++++++++++++++++++++++++
5 files changed, 115 insertions(+), 12 deletions(-)
diff -r d56b2466c044 -r a10af57003fc source/common/picyuv.cpp
--- a/source/common/picyuv.cpp Wed Aug 12 18:12:20 2015 +0530
+++ b/source/common/picyuv.cpp Fri Aug 14 15:07:42 2015 -0700
@@ -237,25 +237,17 @@
pixel *U = m_picOrg[1];
pixel *V = m_picOrg[2];
- uint64_t sumLuma = 0;
+ uint64_t sumLuma;
+ m_maxLumaLevel = primitives.planeClipAndMax(Y, m_stride, width, height, &sumLuma, (pixel)param.minLuma, (pixel)param.maxLuma);
+ m_avgLumaLevel = (double)(sumLuma) / (m_picHeight * m_picWidth);
+
for (int r = 0; r < height; r++)
{
- for (int c = 0; c < width; c++)
- {
- /* Clip luma of source picture to max and min values before extending edges of picYuv */
- Y[c] = x265_clip3((pixel)param.minLuma, (pixel)param.maxLuma, Y[c]);
-
- /* Determine maximum and average luma level in a picture */
- m_maxLumaLevel = X265_MAX(Y[c], m_maxLumaLevel);
- sumLuma += Y[c];
- }
-
for (int x = 0; x < padx; x++)
Y[width + x] = Y[width - 1];
Y += m_stride;
}
- m_avgLumaLevel = (double)(sumLuma) / (m_picHeight * m_picWidth);
for (int r = 0; r < height >> m_vChromaShift; r++)
{
diff -r d56b2466c044 -r a10af57003fc source/common/pixel.cpp
--- a/source/common/pixel.cpp Wed Aug 12 18:12:20 2015 +0530
+++ b/source/common/pixel.cpp Fri Aug 14 15:07:42 2015 -0700
@@ -973,6 +973,31 @@
dst[i] = (int)(propagateAmount * propagateNum / propagateDenom + 0.5);
}
}
+
+/* Clamps a width x height plane to [minPix, maxPix] in place; writes the sample sum to *outsum and returns the largest sample. */
+static pixel planeClipAndMax_c(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix)
+{
+    uint64_t total = 0;
+    pixel peak = 0;
+    pixel *row = src;
+
+    for (int y = 0; y < height; y++, row += stride)
+    {
+        for (int x = 0; x < width; x++)
+        {
+            /* clamp first so that the statistics below describe the value actually stored */
+            const pixel clipped = x265_clip3((pixel)minPix, (pixel)maxPix, row[x]);
+            row[x] = clipped;
+
+            total += clipped;
+            peak = X265_MAX(clipped, peak);
+        }
+    }
+
+    *outsum = total;
+    return peak;
+}
+
} // end anonymous namespace
namespace X265_NS {
@@ -1258,6 +1283,7 @@
p.planecopy_cp = planecopy_cp_c;
p.planecopy_sp = planecopy_sp_c;
p.planecopy_sp_shl = planecopy_sp_shl_c;
+ p.planeClipAndMax = planeClipAndMax_c;
p.propagateCost = estimateCUPropagateCost;
}
}
diff -r d56b2466c044 -r a10af57003fc source/common/primitives.h
--- a/source/common/primitives.h Wed Aug 12 18:12:20 2015 +0530
+++ b/source/common/primitives.h Fri Aug 14 15:07:42 2015 -0700
@@ -185,6 +185,7 @@
typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
typedef void (*planecopy_sp_t) (const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
+typedef pixel (*planeClipAndMax_t)(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix);
typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
@@ -316,6 +317,7 @@
planecopy_cp_t planecopy_cp;
planecopy_sp_t planecopy_sp;
planecopy_sp_t planecopy_sp_shl;
+ planeClipAndMax_t planeClipAndMax;
weightp_sp_t weight_sp;
weightp_pp_t weight_pp;
diff -r d56b2466c044 -r a10af57003fc source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Wed Aug 12 18:12:20 2015 +0530
+++ b/source/common/x86/const-a.asm Fri Aug 14 15:07:42 2015 -0700
@@ -54,6 +54,11 @@
const pb_shuf8x8c, times 1 db 0, 0, 0, 0, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6
const pb_movemask, times 16 db 0x00
times 16 db 0xFF
+
+const pb_movemask_32, times 32 db 0x00
+ times 32 db 0xFF ; table is read at a variable byte offset: a 32-byte load at (pb_movemask_32 + 32 - n)
+ times 32 db 0x00 ; yields n bytes of 0x00 followed by (32 - n) bytes of 0xFF, for 0 <= n <= 32
+
const pb_0000000000000F0F, times 2 db 0xff, 0x00
times 12 db 0x00
const pb_000000000000000F, db 0xff
diff -r d56b2466c044 -r a10af57003fc source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Wed Aug 12 18:12:20 2015 +0530
+++ b/source/common/x86/pixel-a.asm Fri Aug 14 15:07:42 2015 -0700
@@ -70,6 +70,7 @@
cextern pd_2
cextern hmul_16p
cextern pb_movemask
+cextern pb_movemask_32
cextern pw_pixel_max
;=============================================================================
@@ -12493,3 +12494,80 @@
movd eax, xm6
RET
%endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 1
+
+
+;-------------------------------------------------------------------------------------------------------------------------------------
+; pixel planeClipAndMax(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix)
+; Clips 8-bit pixels to [minPix, maxPix] in place, stores the sum of the clipped pixels to *outsum and returns their maximum in eax.
+; A row tail of (width % mmsize) pixels uses one masked mmsize-byte load/store, so bytes past width are read and rewritten unchanged.
+;-------------------------------------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
+INIT_YMM avx2
+cglobal planeClipAndMax, 5,7,8
+    movd            xm0, r5m
+    vpbroadcastb    m0, xm0                 ; m0 = [min]
+    vpbroadcastb    m1, r6m                 ; m1 = [max]
+    pxor            m2, m2                  ; m2 = sumLuma (four qword partial sums)
+    pxor            m3, m3                  ; m3 = maxLumaLevel (per-byte running maximum)
+    pxor            m4, m4                  ; m4 = zero
+    ; split width into full vectors (r2) and leftover pixels (r5), then fetch the mask for the leftover
+    mov             r5d, r2d
+    and             r2d, ~(mmsize - 1)      ; r2 = width rounded down to a multiple of mmsize
+    sub             r5d, r2d                ; r5 = width % mmsize
+    lea             r6, [pb_movemask_32 + mmsize]
+    sub             r6, r5
+    movu            m5, [r6]                ; m5 = 0x00 for the r5 leftover pixels, 0xFF for bytes past width
+    mov             r6d, r5d                ; r6 = leftover pixel count, tested once per row
+
+.loopH:
+    mov             r5d, r2d
+    sub             r5d, mmsize             ; r5 = offset of the last full vector in this row
+    jl             .partial                 ; width < mmsize: no full vector, a negative offset must never be used
+.loopW:
+    movu            m6, [r0 + r5]
+    pmaxub          m6, m0
+    pminub          m6, m1
+    movu            [r0 + r5], m6           ; store back
+    pmaxub          m3, m6                  ; update maxLumaLevel
+    psadbw          m6, m4                  ; add each group of 8 bytes into a qword
+    paddq           m2, m6
+    sub             r5d, mmsize
+    jge            .loopW
+
+.partial:
+    test            r6d, r6d
+    jz             .next                    ; width is a multiple of mmsize: nothing left, do not touch memory past the row
+    movu            m7, [r0 + r2]
+    pmaxub          m6, m7, m0
+    pminub          m6, m1
+    pand            m7, m5                  ; keep the original bytes past width
+    pandn           m6, m5, m6              ; zero the clipped bytes past width
+    por             m7, m6                  ; combine clipped pixels with the unchanged bytes
+    movu            [r0 + r2], m7           ; store back
+    pmaxub          m3, m6                  ; update maxLumaLevel
+    psadbw          m6, m4
+    paddq           m2, m6
+.next:
+    add             r0, r1
+    dec             r3d
+    jg             .loopH
+
+    ; sumLuma: fold the four qword partial sums and store the total to *outsum
+    vextracti128    xm0, m2, 1
+    paddq           xm0, xm2
+    movhlps         xm1, xm0
+    paddq           xm0, xm1
+    movq            [r4], xm0
+    ; maxLumaLevel: reduce to 8 bytes, widen to words and invert so that phminposuw's minimum is the byte maximum
+    vextracti128    xm0, m3, 1
+    pmaxub          xm0, xm3
+    movhlps         xm3, xm0
+    pmaxub          xm0, xm3
+    pmovzxbw        xm0, xm0
+    pxor            xm0, [pb_movemask + 16]
+    phminposuw      xm0, xm0
+    movd            eax, xm0
+    not             al                      ; undo the inversion on the low byte
+    movzx           eax, al
+    RET
+%endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
More information about the x265-devel
mailing list