[x265] [PATCH 06 of 29] intra_pred_dc_new: updated asm and unit test code

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Tue Jan 13 08:11:14 CET 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1418718739 -19800
#      Tue Dec 16 14:02:19 2014 +0530
# Node ID 70b4e0c84320df0b7443e5aea6e110c1bf483684
# Parent  f4daa8744d08b569ae652737c4506b397dfb55cb
intra_pred_dc_new: updated asm and unit test code

diff -r f4daa8744d08 -r 70b4e0c84320 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Jan 12 12:34:37 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Dec 16 14:02:19 2014 +0530
@@ -1705,6 +1705,11 @@
         p.intra_pred[1][BLOCK_16x16] = x265_intra_pred_dc16_sse4;
         p.intra_pred[1][BLOCK_32x32] = x265_intra_pred_dc32_sse4;
 
+        p.intra_pred_new[1][BLOCK_4x4] = x265_intra_pred_dc4_new_sse4;
+        p.intra_pred_new[1][BLOCK_8x8] = x265_intra_pred_dc8_new_sse4;
+        p.intra_pred_new[1][BLOCK_16x16] = x265_intra_pred_dc16_new_sse4;
+        p.intra_pred_new[1][BLOCK_32x32] = x265_intra_pred_dc32_new_sse4;
+
         INTRA_ANG_SSE4_COMMON(sse4);
         INTRA_ANG_SSE4(sse4);
 
diff -r f4daa8744d08 -r 70b4e0c84320 source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h	Mon Jan 12 12:34:37 2015 +0530
+++ b/source/common/x86/intrapred.h	Tue Dec 16 14:02:19 2014 +0530
@@ -31,6 +31,11 @@
 void x265_intra_pred_dc16_sse4(pixel* dst, intptr_t dstStride, pixel* above, pixel* left, int, int filter);
 void x265_intra_pred_dc32_sse4(pixel* dst, intptr_t dstStride, pixel* above, pixel* left, int, int filter);
 
+void x265_intra_pred_dc4_new_sse4(pixel* dst, intptr_t dstStride, pixel* srcPix, int, int filter);   // srcPix: single neighbour buffer (see asm)
+void x265_intra_pred_dc8_new_sse4(pixel* dst, intptr_t dstStride, pixel* srcPix, int, int filter);
+void x265_intra_pred_dc16_new_sse4(pixel* dst, intptr_t dstStride, pixel* srcPix, int, int filter);
+void x265_intra_pred_dc32_new_sse4(pixel* dst, intptr_t dstStride, pixel* srcPix, int, int filter);  // the 32x32 asm loads only the first three args
+
 void x265_intra_pred_planar4_sse4(pixel* dst, intptr_t dstStride, pixel* above, pixel* left, int, int);
 void x265_intra_pred_planar8_sse4(pixel* dst, intptr_t dstStride, pixel* above, pixel* left, int, int);
 void x265_intra_pred_planar16_sse4(pixel* dst, intptr_t dstStride, pixel* above, pixel* left, int, int);
diff -r f4daa8744d08 -r 70b4e0c84320 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm	Mon Jan 12 12:34:37 2015 +0530
+++ b/source/common/x86/intrapred8.asm	Tue Dec 16 14:02:19 2014 +0530
@@ -136,6 +136,67 @@
 .end:
     RET
 
+;---------------------------------------------------------------------------------------------
+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
+;---------------------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal intra_pred_dc4_new, 5,5,3
+    inc         r2                  ; r2 = srcPix + 1: first top neighbour
+    pxor        m0, m0              ; m0 = 0: psadbw reference and pshufb "broadcast byte 0" mask
+    movd        m1, [r2]            ; 4 top neighbours
+    movd        m2, [r2 + 8]        ; 4 left neighbours
+    punpckldq   m1, m2              ; low 8 bytes = top[0..3], left[0..3]
+    psadbw      m1, m0              ; m1 = sum
+
+    test        r4d, r4d            ; ZF = (bFilter == 0); no instruction up to the jz below writes flags
+
+    mov         r4d, 4096           ; pmulhrsw by 4096 = (x * 4096 + 0x4000) >> 15 = (x + 4) >> 3
+    movd        m2, r4d
+    pmulhrsw    m1, m2              ; m1 = (sum + 4) / 8
+    movd        r4d, m1             ; r4d = dc_val
+    pshufb      m1, m0              ; m1 = byte [dc_val ...]
+
+    ; store DC 4x4
+    lea         r3, [r1 * 3]
+    movd        [r0], m1
+    movd        [r0 + r1], m1
+    movd        [r0 + r1 * 2], m1
+    movd        [r0 + r3], m1
+
+    ; do DC filter
+    jz         .end                 ; bFilter == 0: the flat DC block is the final result
+    lea         r3d, [r4d * 2 + 2]  ; r3d = DC * 2 + 2
+    add         r4d, r3d            ; r4d = DC * 3 + 2
+    movd        m1, r4d
+    pshuflw     m1, m1, 0           ; m1 = pixDCx3
+
+    ; filter top
+    pmovzxbw    m2, [r2]            ; top neighbours widened to words (only the low 4 are stored)
+    paddw       m2, m1
+    psraw       m2, 2               ; (top[x] + 3 * DC + 2) >> 2
+    packuswb    m2, m2
+    movd        [r0], m2            ; overwrite top-left pixel, we will update it later
+
+    ; filter top-left
+    movzx       r4d, byte [r2 + 8]  ; left[0]
+    add         r3d, r4d            ; r3d = DC * 2 + 2 + left[0]
+    movzx       r4d, byte [r2]      ; top[0]
+    add         r3d, r4d            ; r3d = top[0] + left[0] + DC * 2 + 2
+    shr         r3d, 2
+    mov         [r0], r3b           ; store via r3b: r0-r3 are the only rNb registers encodable on x86-32
+
+    ; filter left
+    add         r0, r1              ; r0 = row 1 (row 0 was handled as the top-left pixel)
+    pmovzxbw    m2, [r2 + 9]        ; left[1..] widened to words
+    paddw       m2, m1
+    psraw       m2, 2               ; (left[y] + 3 * DC + 2) >> 2
+    packuswb    m2, m2
+    pextrb      [r0], m2, 0         ; row 1, column 0
+    pextrb      [r0 + r1], m2, 1    ; row 2, column 0
+    pextrb      [r0 + r1 * 2], m2, 2 ; row 3, column 0
+
+.end:
+    RET
 
 ;-------------------------------------------------------------------------------------------
 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
@@ -217,6 +278,85 @@
 .end:
     RET
 
+;---------------------------------------------------------------------------------------------
+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
+;---------------------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal intra_pred_dc8_new, 5, 7, 3
+    lea             r3, [r2 + 17]        ; r3 = first left neighbour
+    inc             r2                   ; r2 = first top neighbour
+    pxor            m0,            m0    ; m0 = 0: psadbw reference and pshufb broadcast mask
+    movh            m1,            [r2]  ; 8 top neighbours
+    movh            m2,            [r3]  ; 8 left neighbours
+    punpcklqdq      m1,            m2
+    psadbw          m1,            m0    ; two partial sums, one per 64-bit half
+    pshufd          m2,            m1, 2 ; bring the upper-half sum down to dword 0
+    paddw           m1,            m2    ; low word = sum of all 16 neighbours
+
+    movd            r5d,           m1
+    add             r5d,           8
+    shr             r5d,           4     ; r5d = dc_val = (sum + 8) / 16
+    movd            m1,            r5d
+    pshufb          m1,            m0    ; m1 = byte [dc_val ...]
+
+    test            r4d,           r4d   ; ZF = (bFilter == 0); mov/movh/lea below leave flags intact for the jz
+
+    ; store DC 8x8
+    mov             r6,            r0    ; keep the block origin for the filter stage
+    movh            [r0],          m1
+    movh            [r0 + r1],     m1
+    lea             r0,            [r0 + r1 * 2]
+    movh            [r0],          m1
+    movh            [r0 + r1],     m1
+    lea             r0,            [r0 + r1 * 2]
+    movh            [r0],          m1
+    movh            [r0 + r1],     m1
+    lea             r0,            [r0 + r1 * 2]
+    movh            [r0],          m1
+    movh            [r0 + r1],     m1
+
+    ; Do DC Filter
+    jz              .end
+    lea             r4d,           [r5d * 2 + 2]  ; r4d = DC * 2 + 2
+    add             r5d,           r4d            ; r5d = DC * 3 + 2
+    movd            m1,            r5d
+    pshuflw         m1,            m1, 0          ; m1 = pixDCx3
+    pshufd          m1,            m1, 0
+
+    ; filter top
+    pmovzxbw        m2,            [r2]
+    paddw           m2,            m1
+    psraw           m2,            2              ; (top[x] + 3 * DC + 2) >> 2
+    packuswb        m2,            m2
+    movh            [r6],          m2             ; column 0 is rewritten by the top-left step below
+
+    ; filter top-left
+    movzx           r5d, byte      [r3]           ; left[0]
+    add             r4d,           r5d            ; r4d = DC * 2 + 2 + left[0]
+    movzx           r0d, byte      [r2]           ; top[0]; r0 is no longer needed after the DC stores
+    add             r0d,           r4d
+    shr             r0d,           2
+    mov             [r6],          r0b            ; r0b instead of r5b: r4-r6 have no byte form on x86-32
+
+    ; filter left
+    add             r6,            r1             ; r6 = row 1
+    pmovzxbw        m2,            [r3 + 1]       ; left[1..8] widened to words
+    paddw           m2,            m1
+    psraw           m2,            2
+    packuswb        m2,            m2
+    pextrb          [r6],          m2, 0          ; row 1
+    pextrb          [r6 + r1],     m2, 1          ; row 2
+    pextrb          [r6 + 2 * r1], m2, 2          ; row 3
+    lea             r6,            [r6 + r1 * 2]  ; r6 = row 3
+    pextrb          [r6 + r1],     m2, 3          ; row 4
+    pextrb          [r6 + r1 * 2], m2, 4          ; row 5
+    pextrb          [r6 + r1 * 4], m2, 6          ; row 7
+    lea             r1,            [r1 * 3]
+    pextrb          [r6 + r1],     m2, 5          ; row 6
+
+.end:
+    RET
+
 ;-------------------------------------------------------------------------------------------
 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
 ;-------------------------------------------------------------------------------------------
@@ -332,6 +472,120 @@
 .end:
     RET
 
+;--------------------------------------------------------------------------------------------
+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
+;--------------------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal intra_pred_dc16_new, 5, 7, 4
+    lea             r3, [r2 + 33]        ; r3 = first left neighbour
+    inc             r2                   ; r2 = first top neighbour
+    pxor            m0,            m0    ; m0 = 0: psadbw reference and pshufb broadcast mask
+    movu            m1,            [r2]  ; 16 top neighbours
+    movu            m2,            [r3]  ; 16 left neighbours
+    psadbw          m1,            m0
+    psadbw          m2,            m0
+    paddw           m1,            m2    ; two partial sums, one per 64-bit half
+    pshufd          m2,            m1, 2 ; bring the upper-half sum down to dword 0
+    paddw           m1,            m2    ; low word = sum of all 32 neighbours
+
+    movd            r5d,           m1
+    add             r5d,           16
+    shr             r5d,           5     ; r5d = dc_val = (sum + 16) / 32
+    movd            m1,            r5d
+    pshufb          m1,            m0    ; m1 = byte [dc_val ...]
+
+    test            r4d,           r4d   ; ZF = (bFilter == 0); mov/movu/lea below leave flags intact for the jz
+
+    ; store DC 16x16
+    mov             r6,            r0    ; keep the block origin for the filter stage
+    movu            [r0],          m1
+    movu            [r0 + r1],     m1
+    lea             r0,            [r0 + r1 * 2]
+    movu            [r0],          m1
+    movu            [r0 + r1],     m1
+    lea             r0,            [r0 + r1 * 2]
+    movu            [r0],          m1
+    movu            [r0 + r1],     m1
+    lea             r0,            [r0 + r1 * 2]
+    movu            [r0],          m1
+    movu            [r0 + r1],     m1
+    lea             r0,            [r0 + r1 * 2]
+    movu            [r0],          m1
+    movu            [r0 + r1],     m1
+    lea             r0,            [r0 + r1 * 2]
+    movu            [r0],          m1
+    movu            [r0 + r1],     m1
+    lea             r0,            [r0 + r1 * 2]
+    movu            [r0],          m1
+    movu            [r0 + r1],     m1
+    lea             r0,            [r0 + r1 * 2]
+    movu            [r0],          m1
+    movu            [r0 + r1],     m1
+
+    ; Do DC Filter
+    jz              .end
+    lea             r4d,           [r5d * 2 + 2]  ; r4d = DC * 2 + 2
+    add             r5d,           r4d            ; r5d = DC * 3 + 2
+    movd            m1,            r5d
+    pshuflw         m1,            m1, 0          ; m1 = pixDCx3
+    pshufd          m1,            m1, 0
+
+    ; filter top
+    pmovzxbw        m2,            [r2]           ; top[0..7]
+    paddw           m2,            m1
+    psraw           m2,            2
+    packuswb        m2,            m2
+    movh            [r6],          m2             ; column 0 is rewritten by the top-left step below
+    pmovzxbw        m3,            [r2 + 8]       ; top[8..15]
+    paddw           m3,            m1
+    psraw           m3,            2
+    packuswb        m3,            m3
+    movh            [r6 + 8],      m3
+
+    ; filter top-left
+    movzx           r5d, byte      [r3]           ; left[0]
+    add             r4d,           r5d            ; r4d = DC * 2 + 2 + left[0]
+    movzx           r0d, byte      [r2]           ; top[0]; r0 is no longer needed after the DC stores
+    add             r0d,           r4d
+    shr             r0d,           2
+    mov             [r6],          r0b            ; r0b instead of r5b: r4-r6 have no byte form on x86-32
+
+    ; filter left
+    add             r6,            r1             ; r6 = row 1
+    pmovzxbw        m2,            [r3 + 1]       ; left[1..8]
+    paddw           m2,            m1
+    psraw           m2,            2
+    packuswb        m2,            m2
+    pextrb          [r6],          m2, 0          ; row 1
+    pextrb          [r6 + r1],     m2, 1          ; row 2
+    pextrb          [r6 + r1 * 2], m2, 2          ; row 3
+    lea             r6,            [r6 + r1 * 2]  ; r6 = row 3
+    pextrb          [r6 + r1],     m2, 3          ; row 4
+    pextrb          [r6 + r1 * 2], m2, 4          ; row 5
+    lea             r6,            [r6 + r1 * 2]  ; r6 = row 5
+    pextrb          [r6 + r1],     m2, 5          ; row 6
+    pextrb          [r6 + r1 * 2], m2, 6          ; row 7
+    lea             r6,            [r6 + r1 * 2]  ; r6 = row 7
+    pextrb          [r6 + r1],     m2, 7          ; row 8
+
+    pmovzxbw        m3,            [r3 + 9]       ; left[9..16]; only the first seven are stored
+    paddw           m3,            m1
+    psraw           m3,            2
+    packuswb        m3,            m3
+    pextrb          [r6 + r1 * 2], m3, 0          ; row 9
+    lea             r6,            [r6 + r1 * 2]  ; r6 = row 9
+    pextrb          [r6 + r1],     m3, 1          ; row 10
+    pextrb          [r6 + r1 * 2], m3, 2          ; row 11
+    lea             r6,            [r6 + r1 * 2]  ; r6 = row 11
+    pextrb          [r6 + r1],     m3, 3          ; row 12
+    pextrb          [r6 + r1 * 2], m3, 4          ; row 13
+    lea             r6,            [r6 + r1 * 2]  ; r6 = row 13
+    pextrb          [r6 + r1],     m3, 5          ; row 14
+    pextrb          [r6 + r1 * 2], m3, 6          ; row 15
+
+.end:
+    RET
+
 ;-------------------------------------------------------------------------------------------
 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
 ;-------------------------------------------------------------------------------------------
@@ -406,6 +660,80 @@
 
     RET
 
+;---------------------------------------------------------------------------------------------
+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
+;---------------------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal intra_pred_dc32_new, 3, 5, 5     ; only dst, dstStride and srcPix are loaded; dirMode/bFilter are never read
+    lea             r3, [r2 + 65]        ; r3 = first left neighbour
+    inc             r2                   ; r2 = first top neighbour
+    pxor            m0,            m0    ; m0 = 0: psadbw reference and pshufb broadcast mask
+    movu            m1,            [r2]         ; top[0..15]
+    movu            m2,            [r2 + 16]    ; top[16..31]
+    movu            m3,            [r3]         ; left[0..15]
+    movu            m4,            [r3 + 16]    ; left[16..31]
+    psadbw          m1,            m0
+    psadbw          m2,            m0
+    psadbw          m3,            m0
+    psadbw          m4,            m0
+    paddw           m1,            m2
+    paddw           m3,            m4
+    paddw           m1,            m3    ; two partial sums, one per 64-bit half
+    pshufd          m2,            m1, 2 ; bring the upper-half sum down to dword 0
+    paddw           m1,            m2    ; low word = sum of all 64 neighbours
+
+    movd            r4d,           m1
+    add             r4d,           32
+    shr             r4d,           6     ; r4d = dc_val = (sum + 32) / 64
+    movd            m1,            r4d
+    pshufb          m1,            m0    ; m1 = byte [dc_val ...]
+
+%rep 2
+    ; store 16 rows of 32 pixels per repetition (2 reps = the full 32x32 block)
+    movu            [r0],          m1
+    movu            [r0 + r1],     m1
+    movu            [r0 + 16],     m1
+    movu            [r0 + r1 + 16],m1
+    lea             r0,            [r0 + 2 * r1]
+    movu            [r0],          m1
+    movu            [r0 + r1],     m1
+    movu            [r0 + 16],     m1
+    movu            [r0 + r1 + 16],m1
+    lea             r0,            [r0 + 2 * r1]
+    movu            [r0],          m1
+    movu            [r0 + r1],     m1
+    movu            [r0 + 16],     m1
+    movu            [r0 + r1 + 16],m1
+    lea             r0,            [r0 + 2 * r1]
+    movu            [r0],          m1
+    movu            [r0 + r1],     m1
+    movu            [r0 + 16],     m1
+    movu            [r0 + r1 + 16],m1
+    lea             r0,            [r0 + 2 * r1]
+    movu            [r0],          m1
+    movu            [r0 + r1],     m1
+    movu            [r0 + 16],     m1
+    movu            [r0 + r1 + 16],m1
+    lea             r0,            [r0 + 2 * r1]
+    movu            [r0],          m1
+    movu            [r0 + r1],     m1
+    movu            [r0 + 16],     m1
+    movu            [r0 + r1 + 16],m1
+    lea             r0,            [r0 + 2 * r1]
+    movu            [r0],          m1
+    movu            [r0 + r1],     m1
+    movu            [r0 + 16],     m1
+    movu            [r0 + r1 + 16],m1
+    lea             r0,            [r0 + 2 * r1]
+    movu            [r0],          m1
+    movu            [r0 + r1],     m1
+    movu            [r0 + 16],     m1
+    movu            [r0 + r1 + 16],m1
+    lea             r0,            [r0 + 2 * r1]
+%endrep
+
+    RET
+
 ;-----------------------------------------------------------------------------------------------------------
 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
 ;-----------------------------------------------------------------------------------------------------------
diff -r f4daa8744d08 -r 70b4e0c84320 source/test/intrapredharness.cpp
--- a/source/test/intrapredharness.cpp	Mon Jan 12 12:34:37 2015 +0530
+++ b/source/test/intrapredharness.cpp	Tue Dec 16 14:02:19 2014 +0530
@@ -71,6 +71,38 @@
     return true;
 }
 
+bool IntraPredHarness::check_dc_primitive(intra_pred_new_t ref, intra_pred_new_t opt, int width) // true when opt matches ref on every trial
+{
+    int j = Predict::ADI_BUF_STRIDE;   // read offset into pixel_buff, advanced after each trial
+    intptr_t stride = FENC_STRIDE;
+
+#if _DEBUG
+    memset(pixel_out_vec, 0xCD, OUTPUT_SIZE);
+    memset(pixel_out_c, 0xCD, OUTPUT_SIZE);
+#endif
+
+    for (int i = 0; i <= 100; i++)
+    {
+        int rand_filter = rand() & 1;
+        if (width > 16)
+            rand_filter = 0;           // the edge filter is only exercised for widths up to 16
+
+        ref(pixel_out_c, stride, pixel_buff + j - Predict::ADI_BUF_STRIDE, 0, rand_filter);
+        checked(opt, pixel_out_vec, stride, pixel_buff + j - Predict::ADI_BUF_STRIDE, 0, rand_filter); // via the harness wrapper so reportfail() below has a result to inspect
+
+        for (int k = 0; k < width; k++) // compare the width x width outputs row by row
+        {
+            if (memcmp(pixel_out_vec + k * FENC_STRIDE, pixel_out_c + k * FENC_STRIDE, width * sizeof(pixel)))
+                return false;
+        }
+
+        reportfail();
+        j += FENC_STRIDE;              // next trial reads a different window of pixel_buff
+    }
+
+    return true;
+}
+
 bool IntraPredHarness::check_planar_primitive(intra_pred_t ref, intra_pred_t opt, int width)
 {
     int j = Predict::ADI_BUF_STRIDE;
@@ -222,6 +254,15 @@
                 return false;
             }
         }
+        if (opt.intra_pred_new[1][i])
+        {
+            const int size = (1 << (i + 2));
+            if (!check_dc_primitive(ref.intra_pred_new[1][i], opt.intra_pred_new[1][i], size))
+            {
+                printf("intra_dc %dx%d failed\n", size, size);
+                return false;
+            }
+        }
     }
 
     // NOTE: always call since this function have check pointer in loop
@@ -279,6 +320,18 @@
             REPORT_SPEEDUP(opt.intra_pred_allangs[i], ref.intra_pred_allangs[i],
                            pixel_out_33_vec, refAbove, refLeft, refAbove, refLeft, bFilter);
         }
+        if (opt.intra_pred_new[1][i])
+        {
+            printf("intra_dc_new_%dx%d[f=0]", size, size);
+            REPORT_SPEEDUP(opt.intra_pred_new[1][i], ref.intra_pred_new[1][i],
+                           pixel_out_vec, FENC_STRIDE, pixel_buff + srcStride, 0, 0);
+            if (size <= 16)
+            {
+                printf("intra_dc_new_%dx%d[f=1]", size, size);
+                REPORT_SPEEDUP(opt.intra_pred_new[1][i], ref.intra_pred_new[1][i],
+                               pixel_out_vec, FENC_STRIDE, pixel_buff + srcStride, 0, 1);
+            }
+        }
     }
 
     for (int ii = 2; ii <= 5; ii++)
diff -r f4daa8744d08 -r 70b4e0c84320 source/test/intrapredharness.h
--- a/source/test/intrapredharness.h	Mon Jan 12 12:34:37 2015 +0530
+++ b/source/test/intrapredharness.h	Tue Dec 16 14:02:19 2014 +0530
@@ -42,6 +42,7 @@
     pixel pixel_out_33_vec[OUTPUT_SIZE_33];
 
     bool check_dc_primitive(intra_pred_t ref, intra_pred_t opt, int width);
+    bool check_dc_primitive(intra_pred_new_t ref, intra_pred_new_t opt, int width);
     bool check_planar_primitive(intra_pred_t ref, intra_pred_t opt, int width);
     bool check_angular_primitive(const intra_pred_t ref[][NUM_TR_SIZE], const intra_pred_t opt[][NUM_TR_SIZE]);
     bool check_allangs_primitive(const intra_allangs_t ref[], const intra_allangs_t opt[]);


More information about the x265-devel mailing list