[x265] [PATCH 06 of 29] intra_pred_dc_new: updated asm and unit test code

chen chenm003 at 163.com
Tue Jan 13 09:29:18 CET 2015



At 2015-01-13 15:11:14,dnyaneshwar at multicorewareinc.com wrote:
># HG changeset patch
># User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
># Date 1418718739 -19800
>#      Tue Dec 16 14:02:19 2014 +0530
># Node ID 70b4e0c84320df0b7443e5aea6e110c1bf483684
># Parent  f4daa8744d08b569ae652737c4506b397dfb55cb
>intra_pred_dc_new: updated asm and unit test code
>
>+;---------------------------------------------------------------------------------------------
>+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
>+;---------------------------------------------------------------------------------------------
>+INIT_XMM sse4
>+cglobal intra_pred_dc4_new, 5,5,3
>+    inc         r2
>+    pxor        m0, m0
>+    movd        m1, [r2]
>+    movd        m2, [r2 + 8]
>+    punpckldq   m1, m2
>+    psadbw      m1, m0              ; m1 = sum
>+
>+    test        r4d, r4d
>+
>+    mov         r4d, 4096
>+    movd        m2, r4d
>+    pmulhrsw    m1, m2              ; m1 = (sum + 4) / 8
There is already a constant pw_4096; use it here instead of loading 4096 through r4d.

>+    movd        r4d, m1             ; r4d = dc_val
>+    pshufb      m1, m0              ; m1 = byte [dc_val ...]
>+
>+    ; store DC 4x4
>+    lea         r3, [r1 * 3]
>+    movd        [r0], m1
>+    movd        [r0 + r1], m1
>+    movd        [r0 + r1 * 2], m1
>+    movd        [r0 + r3], m1
>+
>+    ; do DC filter
>+    jz         .end
>+    lea         r3d, [r4d * 2 + 2]  ; r3d = DC * 2 + 2
>+    add         r4d, r3d            ; r4d = DC * 3 + 2
>+    movd        m1, r4d
>+    pshuflw     m1, m1, 0           ; m1 = pixDCx3
We have more free registers, so we can keep the DC value in a register to reduce the number of operations.
 
>+
>+    ; filter top
>+    pmovzxbw    m2, [r2]
>+    paddw       m2, m1
>+    psraw       m2, 2
Combined with the modification above, we can compute (x + 2) >> 2 with pmulhrsw.

>+    packuswb    m2, m2
>+    movd        [r0], m2            ; overwrite top-left pixel, we will update it later
>+
>+    ; filter top-left
>+    movzx       r4d, byte [r2 + 8]
>+    add         r3d, r4d
>+    movzx       r4d, byte [r2]
>+    add         r4d, r3d
>+    shr         r4d, 2
>+    mov         [r0], r4b
>+
>+    ; filter left
>+    add         r0, r1
>+    pmovzxbw    m2, [r2 + 9]
>+    paddw       m2, m1
>+    psraw       m2, 2
>+    packuswb    m2, m2
>+    pextrb      [r0], m2, 0
>+    pextrb      [r0 + r1], m2, 1
>+    pextrb      [r0 + r1 * 2], m2, 2
>+
>+.end:
>+    RET
> 
> ;-------------------------------------------------------------------------------------------
> ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
>@@ -217,6 +278,85 @@
> .end:
>     RET
> 
>+;---------------------------------------------------------------------------------------------
>+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
>+;---------------------------------------------------------------------------------------------
>+INIT_XMM sse4
>+cglobal intra_pred_dc8_new, 5, 7, 3
>+    lea             r3, [r2 + 17]
>+    inc             r2
>+    pxor            m0,            m0
>+    movh            m1,            [r2]
>+    movh            m2,            [r3]
>+    punpcklqdq      m1,            m2
How about using movhps to load the second 8 bytes directly into the high half of m1?

>+    psadbw          m1,            m0
>+    pshufd          m2,            m1, 2
>+    paddw           m1,            m2
Should this be paddd?

>+
>+    movd            r5d,           m1
>+    add             r5d,           8
>+    shr             r5d,           4     ; sum = sum / 16
Same as 4x4: we can reduce the conversions between general-purpose and SSE registers.

>+    movd            m1,            r5d
>+    pshufb          m1,            m0    ; m1 = byte [dc_val ...]
>+
>+    test            r4d,           r4d
>+
>+    ; store DC 8x8
>+    mov             r6,            r0
>+    movh            [r0],          m1
>+    movh            [r0 + r1],     m1
>+    lea             r0,            [r0 + r1 * 2]
>+    movh            [r0],          m1
>+    movh            [r0 + r1],     m1
>+    lea             r0,            [r0 + r1 * 2]
>+    movh            [r0],          m1
>+    movh            [r0 + r1],     m1
>+    lea             r0,            [r0 + r1 * 2]
>+    movh            [r0],          m1
>+    movh            [r0 + r1],     m1
>+
>+    ; Do DC Filter
>+    jz              .end
>+    lea             r4d,           [r5d * 2 + 2]  ; r4d = DC * 2 + 2
>+    add             r5d,           r4d            ; r5d = DC * 3 + 2
>+    movd            m1,            r5d
>+    pshuflw         m1,            m1, 0          ; m1 = pixDCx3
>+    pshufd          m1,            m1, 0
>+
>+    ; filter top
>+    pmovzxbw        m2,            [r2]
>+    paddw           m2,            m1
>+    psraw           m2,            2
>+    packuswb        m2,            m2
>+    movh            [r6],          m2
>+
>+    ; filter top-left
>+    movzx           r5d, byte      [r3]
>+    add             r4d,           r5d
>+    movzx           r5d, byte      [r2]
>+    add             r5d,           r4d
>+    shr             r5d,           2
>+    mov             [r6],          r5b
>+
>+    ; filter left
>+    add             r6,            r1
>+    pmovzxbw        m2,            [r3 + 1]
>+    paddw           m2,            m1
>+    psraw           m2,            2
>+    packuswb        m2,            m2
>+    pextrb          [r6],          m2, 0
>+    pextrb          [r6 + r1],     m2, 1
>+    pextrb          [r6 + 2 * r1], m2, 2
>+    lea             r6,            [r6 + r1 * 2]
>+    pextrb          [r6 + r1],     m2, 3
>+    pextrb          [r6 + r1 * 2], m2, 4
>+    pextrb          [r6 + r1 * 4], m2, 6
>+    lea             r1,            [r1 * 3]
How about reusing r0 or another free register to eliminate the "lea r6" above?

>+    pextrb          [r6 + r1],     m2, 5
>+
>+.end:
>+    RET
>+
> ;-------------------------------------------------------------------------------------------
> ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
> ;-------------------------------------------------------------------------------------------
>@@ -332,6 +472,120 @@
> .end:
>     RET
> 
>+;--------------------------------------------------------------------------------------------
>+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
>+;--------------------------------------------------------------------------------------------
>+INIT_XMM sse4
>+cglobal intra_pred_dc16_new, 5, 7, 4
>+    lea             r3, [r2 + 33]
>+    inc             r2
>+    pxor            m0,            m0
>+    movu            m1,            [r2]
>+    movu            m2,            [r3]
>+    psadbw          m1,            m0
>+    psadbw          m2,            m0
>+    paddw           m1,            m2
>+    pshufd          m2,            m1, 2
>+    paddw           m1,            m2
>+
>+    movd            r5d,           m1
>+    add             r5d,           16
>+    shr             r5d,           5     ; sum = sum / 32
>+    movd            m1,            r5d
>+    pshufb          m1,            m0    ; m1 = byte [dc_val ...]
>+
>+    test            r4d,           r4d
>+
>+    ; store DC 16x16
>+    mov             r6,            r0
>+    movu            [r0],          m1
>+    movu            [r0 + r1],     m1
>+    lea             r0,            [r0 + r1 * 2]
>+    movu            [r0],          m1
>+    movu            [r0 + r1],     m1
>+    lea             r0,            [r0 + r1 * 2]
>+    movu            [r0],          m1
>+    movu            [r0 + r1],     m1
>+    lea             r0,            [r0 + r1 * 2]
>+    movu            [r0],          m1
>+    movu            [r0 + r1],     m1
>+    lea             r0,            [r0 + r1 * 2]
>+    movu            [r0],          m1
>+    movu            [r0 + r1],     m1
>+    lea             r0,            [r0 + r1 * 2]
>+    movu            [r0],          m1
>+    movu            [r0 + r1],     m1
>+    lea             r0,            [r0 + r1 * 2]
>+    movu            [r0],          m1
>+    movu            [r0 + r1],     m1
>+    lea             r0,            [r0 + r1 * 2]
>+    movu            [r0],          m1
>+    movu            [r0 + r1],     m1
>+
>+    ; Do DC Filter
>+    jz              .end
>+    lea             r4d,           [r5d * 2 + 2]  ; r4d = DC * 2 + 2
>+    add             r5d,           r4d            ; r5d = DC * 3 + 2
>+    movd            m1,            r5d
>+    pshuflw         m1,            m1, 0          ; m1 = pixDCx3
>+    pshufd          m1,            m1, 0
>+
>+    ; filter top
>+    pmovzxbw        m2,            [r2]
>+    paddw           m2,            m1
>+    psraw           m2,            2
>+    packuswb        m2,            m2
>+    movh            [r6],          m2
>+    pmovzxbw        m3,            [r2 + 8]
>+    paddw           m3,            m1
>+    psraw           m3,            2
>+    packuswb        m3,            m3
>+    movh            [r6 + 8],      m3
>+
>+    ; filter top-left
>+    movzx           r5d, byte      [r3]
>+    add             r4d,           r5d
>+    movzx           r5d, byte      [r2]
>+    add             r5d,           r4d
>+    shr             r5d,           2
>+    mov             [r6],          r5b
>+
>+    ; filter left
>+    add             r6,            r1
>+    pmovzxbw        m2,            [r3 + 1]
>+    paddw           m2,            m1
>+    psraw           m2,            2
>+    packuswb        m2,            m2
>+    pextrb          [r6],          m2, 0
>+    pextrb          [r6 + r1],     m2, 1
>+    pextrb          [r6 + r1 * 2], m2, 2
>+    lea             r6,            [r6 + r1 * 2]
>+    pextrb          [r6 + r1],     m2, 3
>+    pextrb          [r6 + r1 * 2], m2, 4
>+    lea             r6,            [r6 + r1 * 2]
>+    pextrb          [r6 + r1],     m2, 5
>+    pextrb          [r6 + r1 * 2], m2, 6
>+    lea             r6,            [r6 + r1 * 2]
>+    pextrb          [r6 + r1],     m2, 7
>+
>+    pmovzxbw        m3,            [r3 + 9]
>+    paddw           m3,            m1
>+    psraw           m3,            2
>+    packuswb        m3,            m3
>+    pextrb          [r6 + r1 * 2], m3, 0
>+    lea             r6,            [r6 + r1 * 2]
>+    pextrb          [r6 + r1],     m3, 1
>+    pextrb          [r6 + r1 * 2], m3, 2
>+    lea             r6,            [r6 + r1 * 2]
>+    pextrb          [r6 + r1],     m3, 3
>+    pextrb          [r6 + r1 * 2], m3, 4
>+    lea             r6,            [r6 + r1 * 2]
>+    pextrb          [r6 + r1],     m3, 5
>+    pextrb          [r6 + r1 * 2], m3, 6
>+
>+.end:
>+    RET
>+
> ;-------------------------------------------------------------------------------------------
> ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
> ;-------------------------------------------------------------------------------------------
>@@ -406,6 +660,80 @@
> 
>     RET
> 
>+;---------------------------------------------------------------------------------------------
>+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
>+;---------------------------------------------------------------------------------------------
>+INIT_XMM sse4
>+cglobal intra_pred_dc32_new, 3, 5, 5
>+    lea             r3, [r2 + 65]
>+    inc             r2
>+    pxor            m0,            m0
>+    movu            m1,            [r2]
>+    movu            m2,            [r2 + 16]
>+    movu            m3,            [r3]
>+    movu            m4,            [r3 + 16]
>+    psadbw          m1,            m0
>+    psadbw          m2,            m0
>+    psadbw          m3,            m0
>+    psadbw          m4,            m0
>+    paddw           m1,            m2
>+    paddw           m3,            m4
>+    paddw           m1,            m3
>+    pshufd          m2,            m1, 2
>+    paddw           m1,            m2
>+
>+    movd            r4d,           m1
>+    add             r4d,           32
>+    shr             r4d,           6     ; sum = sum / 64
>+    movd            m1,            r4d
>+    pshufb          m1,            m0    ; m1 = byte [dc_val ...]
>+
>+%rep 2
>+    ; store DC 16x16
>+    movu            [r0],          m1
>+    movu            [r0 + r1],     m1
>+    movu            [r0 + 16],     m1
>+    movu            [r0 + r1 + 16],m1
>+    lea             r0,            [r0 + 2 * r1]
>+    movu            [r0],          m1
>+    movu            [r0 + r1],     m1
>+    movu            [r0 + 16],     m1
>+    movu            [r0 + r1 + 16],m1
>+    lea             r0,            [r0 + 2 * r1]
>+    movu            [r0],          m1
>+    movu            [r0 + r1],     m1
>+    movu            [r0 + 16],     m1
>+    movu            [r0 + r1 + 16],m1
>+    lea             r0,            [r0 + 2 * r1]
>+    movu            [r0],          m1
>+    movu            [r0 + r1],     m1
>+    movu            [r0 + 16],     m1
>+    movu            [r0 + r1 + 16],m1
>+    lea             r0,            [r0 + 2 * r1]
>+    movu            [r0],          m1
>+    movu            [r0 + r1],     m1
>+    movu            [r0 + 16],     m1
>+    movu            [r0 + r1 + 16],m1
>+    lea             r0,            [r0 + 2 * r1]
>+    movu            [r0],          m1
>+    movu            [r0 + r1],     m1
>+    movu            [r0 + 16],     m1
>+    movu            [r0 + r1 + 16],m1
>+    lea             r0,            [r0 + 2 * r1]
>+    movu            [r0],          m1
>+    movu            [r0 + r1],     m1
>+    movu            [r0 + 16],     m1
>+    movu            [r0 + r1 + 16],m1
>+    lea             r0,            [r0 + 2 * r1]
>+    movu            [r0],          m1
>+    movu            [r0 + r1],     m1
>+    movu            [r0 + 16],     m1
>+    movu            [r0 + r1 + 16],m1
>+    lea             r0,            [r0 + 2 * r1]
>+%endrep
>+
>+    RET
>+
> ;-----------------------------------------------------------------------------------------------------------
> ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
> ;-----------------------------------------------------------------------------------------------------------
>diff -r f4daa8744d08 -r 70b4e0c84320 source/test/intrapredharness.cpp
>--- a/source/test/intrapredharness.cpp Mon Jan 12 12:34:37 2015 +0530
>+++ b/source/test/intrapredharness.cpp Tue Dec 16 14:02:19 2014 +0530
>@@ -71,6 +71,38 @@
>     return true;
> }
> 
>+bool IntraPredHarness::check_dc_primitive(intra_pred_new_t ref, intra_pred_new_t opt, int width)
>+{
>+    int j = Predict::ADI_BUF_STRIDE;
>+    intptr_t stride = FENC_STRIDE;
>+
>+#if _DEBUG
>+    memset(pixel_out_vec, 0xCD, OUTPUT_SIZE);
>+    memset(pixel_out_c, 0xCD, OUTPUT_SIZE);
>+#endif
>+
>+    for (int i = 0; i <= 100; i++)
>+    {
>+        int rand_filter = rand() & 1;
>+        if (width > 16)
>+            rand_filter = 0;
>+
>+        ref(pixel_out_c, stride, pixel_buff + j - Predict::ADI_BUF_STRIDE, 0, rand_filter);
>+        opt(pixel_out_vec, stride, pixel_buff + j - Predict::ADI_BUF_STRIDE, 0, rand_filter);
>+
>+        for (int k = 0; k < width; k++)
>+        {
>+            if (memcmp(pixel_out_vec + k * FENC_STRIDE, pixel_out_c + k * FENC_STRIDE, width * sizeof(pixel)))
>+                return false;
>+        }
>+
>+        reportfail();
>+        j += FENC_STRIDE;
>+    }
>+
>+    return true;
>+}
>+
> bool IntraPredHarness::check_planar_primitive(intra_pred_t ref, intra_pred_t opt, int width)
> {
>     int j = Predict::ADI_BUF_STRIDE;
>@@ -222,6 +254,15 @@
>                 return false;
>             }
>         }
>+        if (opt.intra_pred_new[1][i])
>+        {
>+            const int size = (1 << (i + 2));
>+            if (!check_dc_primitive(ref.intra_pred_new[1][i], opt.intra_pred_new[1][i], size))
>+            {
>+                printf("intra_dc %dx%d failed\n", size, size);
>+                return false;
>+            }
>+        }
>     }
> 
>     // NOTE: always call since this function have check pointer in loop
>@@ -279,6 +320,18 @@
>             REPORT_SPEEDUP(opt.intra_pred_allangs[i], ref.intra_pred_allangs[i],
>                            pixel_out_33_vec, refAbove, refLeft, refAbove, refLeft, bFilter);
>         }
>+        if (opt.intra_pred_new[1][i])
>+        {
>+            printf("intra_dc_new_%dx%d[f=0]", size, size);
>+            REPORT_SPEEDUP(opt.intra_pred_new[1][i], ref.intra_pred_new[1][i],
>+                           pixel_out_vec, FENC_STRIDE, pixel_buff + srcStride, 0, 0);
>+            if (size <= 16)
>+            {
>+                printf("intra_dc_new_%dx%d[f=1]", size, size);
>+                REPORT_SPEEDUP(opt.intra_pred_new[1][i], ref.intra_pred_new[1][i],
>+                               pixel_out_vec, FENC_STRIDE, pixel_buff + srcStride, 0, 1);
>+            }
>+        }
>     }
> 
>     for (int ii = 2; ii <= 5; ii++)
>diff -r f4daa8744d08 -r 70b4e0c84320 source/test/intrapredharness.h
>--- a/source/test/intrapredharness.h Mon Jan 12 12:34:37 2015 +0530
>+++ b/source/test/intrapredharness.h Tue Dec 16 14:02:19 2014 +0530
>@@ -42,6 +42,7 @@
>     pixel pixel_out_33_vec[OUTPUT_SIZE_33];
> 
>     bool check_dc_primitive(intra_pred_t ref, intra_pred_t opt, int width);
>+    bool check_dc_primitive(intra_pred_new_t ref, intra_pred_new_t opt, int width);
>     bool check_planar_primitive(intra_pred_t ref, intra_pred_t opt, int width);
>     bool check_angular_primitive(const intra_pred_t ref[][NUM_TR_SIZE], const intra_pred_t opt[][NUM_TR_SIZE]);
>     bool check_allangs_primitive(const intra_allangs_t ref[], const intra_allangs_t opt[]);
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150113/979e7621/attachment-0001.html>


More information about the x265-devel mailing list