[x265] [PATCH] asm: new primitive planeClipAndMax to clip luma to a custom range and collect MaxLumaLevel statistics

Min Chen chenm003 at 163.com
Sat Aug 15 00:07:49 CEST 2015


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1439590062 25200
# Node ID a10af57003fc3044d0d4c000290faeda19a81777
# Parent  d56b2466c04459205287e1581d8a36eebf372ba6
asm: new primitive planeClipAndMax to clip luma to a custom range and collect MaxLumaLevel statistics
---
 source/common/picyuv.cpp      |   16 ++------
 source/common/pixel.cpp       |   26 +++++++++++++
 source/common/primitives.h    |    2 +
 source/common/x86/const-a.asm |    5 +++
 source/common/x86/pixel-a.asm |   78 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 115 insertions(+), 12 deletions(-)

diff -r d56b2466c044 -r a10af57003fc source/common/picyuv.cpp
--- a/source/common/picyuv.cpp	Wed Aug 12 18:12:20 2015 +0530
+++ b/source/common/picyuv.cpp	Fri Aug 14 15:07:42 2015 -0700
@@ -237,25 +237,17 @@
     pixel *U = m_picOrg[1];
     pixel *V = m_picOrg[2];
 
-    uint64_t sumLuma = 0;
+    uint64_t sumLuma;
+    m_maxLumaLevel = primitives.planeClipAndMax(Y, m_stride, width, height, &sumLuma, (pixel)param.minLuma, (pixel)param.maxLuma);
+    m_avgLumaLevel = (double)(sumLuma) / (m_picHeight * m_picWidth);
+
     for (int r = 0; r < height; r++)
     {
-        for (int c = 0; c < width; c++)
-        {
-            /* Clip luma of source picture to max and min values before extending edges of picYuv */
-            Y[c] = x265_clip3((pixel)param.minLuma, (pixel)param.maxLuma, Y[c]);
-
-            /* Determine maximum and average luma level in a picture */
-            m_maxLumaLevel = X265_MAX(Y[c], m_maxLumaLevel);
-            sumLuma += Y[c];
-        }
-
         for (int x = 0; x < padx; x++)
             Y[width + x] = Y[width - 1];
 
         Y += m_stride;
     }
-    m_avgLumaLevel = (double)(sumLuma) / (m_picHeight * m_picWidth);
 
     for (int r = 0; r < height >> m_vChromaShift; r++)
     {
diff -r d56b2466c044 -r a10af57003fc source/common/pixel.cpp
--- a/source/common/pixel.cpp	Wed Aug 12 18:12:20 2015 +0530
+++ b/source/common/pixel.cpp	Fri Aug 14 15:07:42 2015 -0700
@@ -973,6 +973,31 @@
         dst[i] = (int)(propagateAmount * propagateNum / propagateDenom + 0.5);
     }
 }
+
+/* Clamp every pixel of a width x height plane to [minPix, maxPix] in place.
+ * Returns the largest clamped pixel and writes the sum of all clamped pixels to *outsum. */
+static pixel planeClipAndMax_c(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix)
+{
+    uint64_t total = 0;
+    pixel peak = 0;
+    pixel *row = src;
+
+    for (int y = 0; y < height; y++, row += stride)
+    {
+        for (int x = 0; x < width; x++)
+        {
+            const pixel clipped = x265_clip3((pixel)minPix, (pixel)maxPix, row[x]);
+            row[x] = clipped;
+            total += clipped;
+            if (clipped > peak)
+                peak = clipped;
+        }
+    }
+
+    *outsum = total;
+    return peak;
+}
+
 }  // end anonymous namespace
 
 namespace X265_NS {
@@ -1258,6 +1283,7 @@
     p.planecopy_cp = planecopy_cp_c;
     p.planecopy_sp = planecopy_sp_c;
     p.planecopy_sp_shl = planecopy_sp_shl_c;
+    p.planeClipAndMax = planeClipAndMax_c;
     p.propagateCost = estimateCUPropagateCost;
 }
 }
diff -r d56b2466c044 -r a10af57003fc source/common/primitives.h
--- a/source/common/primitives.h	Wed Aug 12 18:12:20 2015 +0530
+++ b/source/common/primitives.h	Fri Aug 14 15:07:42 2015 -0700
@@ -185,6 +185,7 @@
 typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
 typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
 typedef void (*planecopy_sp_t) (const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
+typedef pixel (*planeClipAndMax_t)(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix);
 
 typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
 
@@ -316,6 +317,7 @@
     planecopy_cp_t        planecopy_cp;
     planecopy_sp_t        planecopy_sp;
     planecopy_sp_t        planecopy_sp_shl;
+    planeClipAndMax_t     planeClipAndMax;
 
     weightp_sp_t          weight_sp;
     weightp_pp_t          weight_pp;
diff -r d56b2466c044 -r a10af57003fc source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Wed Aug 12 18:12:20 2015 +0530
+++ b/source/common/x86/const-a.asm	Fri Aug 14 15:07:42 2015 -0700
@@ -54,6 +54,11 @@
 const pb_shuf8x8c,          times  1 db   0,   0,   0,   0,   2,   2,   2,   2,   4,   4,   4,   4,   6,   6,   6,   6
 const pb_movemask,          times 16 db 0x00
                             times 16 db 0xFF
+
+const pb_movemask_32,       times 32 db 0x00    ; a 32-byte load from (pb_movemask_32 + 32 - n), n = 0..31,
+                            times 32 db 0xFF    ; yields n bytes of 0x00 followed by (32 - n) bytes of 0xFF
+                            times 32 db 0x00
+
 const pb_0000000000000F0F,  times  2 db 0xff, 0x00
                             times 12 db 0x00
 const pb_000000000000000F,           db 0xff
diff -r d56b2466c044 -r a10af57003fc source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Wed Aug 12 18:12:20 2015 +0530
+++ b/source/common/x86/pixel-a.asm	Fri Aug 14 15:07:42 2015 -0700
@@ -70,6 +70,7 @@
 cextern pd_2
 cextern hmul_16p
 cextern pb_movemask
+cextern pb_movemask_32
 cextern pw_pixel_max
 
 ;=============================================================================
@@ -12493,3 +12494,80 @@
     movd            eax, xm6
     RET
 %endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 1
+
+
+;-------------------------------------------------------------------------------------------------------------------------------------
+; pixel planeClipAndMax(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix)
+;   Clamps each pixel to [minPix, maxPix] in place, stores the sum of the clamped pixels in *outsum, returns the largest one.
+;   Each row is loaded and stored up to (width & ~(mmsize - 1)) + mmsize bytes; bytes at and beyond width are written back unchanged.
+;   r0 = src, r1 = stride, r2 = width, r3 = height, r4 = outsum; minPix is read from r5m and maxPix from r6m.
+;-------------------------------------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
+INIT_YMM avx2
+cglobal planeClipAndMax, 5,7,8
+    movd            xm0, r5m
+    vpbroadcastb    m0, xm0                 ; m0 = [min]
+    vpbroadcastb    m1, r6m                 ; m1 = [max]
+    pxor            m2, m2                  ; m2 = sumLuma
+    pxor            m3, m3                  ; m3 = maxLumaLevel
+    pxor            m4, m4                  ; m4 = zero
+
+    ; build the byte mask for the last, partially filled vector of each row
+    mov             r5d, r2d
+    and             r2d, ~(mmsize - 1)      ; r2 = width rounded down to whole vectors
+    sub             r5d, r2d                ; r5 = number of leftover pixels (0 .. mmsize - 1)
+    lea             r6, [pb_movemask_32 + mmsize]
+    sub             r6, r5
+    movu            m5, [r6]                ; m5 = 0x00 for the leftover pixels, 0xFF for bytes beyond width
+
+.loopH:
+    mov             r5d, r2d
+    sub             r5d, mmsize             ; r5 = offset of the last whole vector in this row
+    jl             .partial                 ; width < mmsize: no whole vector, never index with a negative offset
+.loopW:
+    movu            m6, [r0 + r5]
+    pmaxub          m6, m0
+    pminub          m6, m1
+    movu            [r0 + r5], m6           ; store back
+    pmaxub          m3, m6                  ; update maxLumaLevel
+    psadbw          m6, m4                  ; byte sums, one per qword
+    paddq           m2, m6
+    sub             r5d, mmsize
+    jge            .loopW
+
+.partial:                                   ; partial pixels at [r0 + r2]
+    movu            m7, [r0 + r2]
+    pmaxub          m6, m7, m0
+    pminub          m6, m1
+    pand            m7, m5                  ; keep the original bytes beyond width
+    pandn           m6, m5, m6              ; keep the clipped bytes inside width, zero the rest
+    por             m7, m6                  ; combine clipped and untouched bytes
+    movu            [r0 + r2], m7           ; store back
+    pmaxub          m3, m6                  ; update maxLumaLevel
+    psadbw          m6, m4
+    paddq           m2, m6
+
+    add             r0, r1
+    dec             r3d
+    jg             .loopH
+
+    ; sumLuma
+    vextracti128    xm0, m2, 1
+    paddq           xm0, xm2
+    movhlps         xm1, xm0
+    paddq           xm0, xm1
+    movq            [r4], xm0
+
+    ; maxLumaLevel
+    vextracti128    xm0, m3, 1
+    pmaxub          xm0, xm3
+    movhlps         xm3, xm0
+    pmaxub          xm0, xm3                ; low 8 bytes = remaining candidates
+    pmovzxbw        xm0, xm0                ; widen the 8 candidates to words
+    pxor            xm0, [pb_movemask + 16] ; invert all bits: the largest byte becomes the smallest word
+    phminposuw      xm0, xm0                ; smallest word goes to the low word
+    movd            eax, xm0
+    not             al                      ; undo the inversion on the low byte
+    movzx           eax, al
+    RET
+%endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0



More information about the x265-devel mailing list