[x265] [PATCH] asm: assembly code for IntraPred_DC[32x32], Disabled DC filter for cuSize > 16 in testbench

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Thu Nov 21 12:13:59 CET 2013


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1385032379 -19800
#      Thu Nov 21 16:42:59 2013 +0530
# Node ID 2a7d0c62bff43cd72ef6162a8197b3046e6c175c
# Parent  0cc83d3c357a5541bd7c159c4af1d1a3063860ae
asm: assembly code for IntraPred_DC[32x32], Disabled DC filter for cuSize > 16 in testbench

diff -r 0cc83d3c357a -r 2a7d0c62bff4 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Nov 21 16:19:54 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Nov 21 16:42:59 2013 +0530
@@ -658,6 +658,7 @@
         p.intra_pred_dc[BLOCK_4x4] = x265_intra_pred_dc4_sse4;
         p.intra_pred_dc[BLOCK_8x8] = x265_intra_pred_dc8_sse4;
         p.intra_pred_dc[BLOCK_16x16] = x265_intra_pred_dc16_sse4;
+        p.intra_pred_dc[BLOCK_32x32] = x265_intra_pred_dc32_sse4;
     }
     if (cpuMask & X265_CPU_AVX)
     {
diff -r 0cc83d3c357a -r 2a7d0c62bff4 source/common/x86/intrapred.asm
--- a/source/common/x86/intrapred.asm	Thu Nov 21 16:19:54 2013 +0530
+++ b/source/common/x86/intrapred.asm	Thu Nov 21 16:42:59 2013 +0530
@@ -289,3 +289,76 @@
 
 .end
     RET
+
+;-------------------------------------------------------------------------------------------
+; void intra_pred_dc(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter)
+;-------------------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal intra_pred_dc32, 4, 5, 3, above, left, dst, dstStride, filter
+
+    pxor            m0,            m0
+    movu            m1,            [r0]
+    movu            m2,            [r0 + 16]
+    movu            m3,            [r1]
+    movu            m4,            [r1 + 16]
+    psadbw          m1,            m0
+    psadbw          m2,            m0
+    psadbw          m3,            m0
+    psadbw          m4,            m0
+    paddw           m1,            m2
+    paddw           m3,            m4
+    paddw           m1,            m3
+    pshufd          m2,            m1, 2
+    paddw           m1,            m2
+
+    movd            r4d,           m1
+    add             r4d,           32
+    shr             r4d,           6     ; sum = sum / 64
+    movd            m1,            r4d
+    pshufb          m1,            m0    ; m1 = byte [dc_val ...]
+
+%rep 2
+    ; store DC 16x16
+    movu            [r2],          m1
+    movu            [r2 + r3],     m1
+    movu            [r2 + 16],     m1
+    movu            [r2 + r3 + 16],m1
+    lea             r2,            [r2 + 2 * r3]
+    movu            [r2],          m1
+    movu            [r2 + r3],     m1
+    movu            [r2 + 16],     m1
+    movu            [r2 + r3 + 16],m1
+    lea             r2,            [r2 + 2 * r3]
+    movu            [r2],          m1
+    movu            [r2 + r3],     m1
+    movu            [r2 + 16],     m1
+    movu            [r2 + r3 + 16],m1
+    lea             r2,            [r2 + 2 * r3]
+    movu            [r2],          m1
+    movu            [r2 + r3],     m1
+    movu            [r2 + 16],     m1
+    movu            [r2 + r3 + 16],m1
+    lea             r2,            [r2 + 2 * r3]
+    movu            [r2],          m1
+    movu            [r2 + r3],     m1
+    movu            [r2 + 16],     m1
+    movu            [r2 + r3 + 16],m1
+    lea             r2,            [r2 + 2 * r3]
+    movu            [r2],          m1
+    movu            [r2 + r3],     m1
+    movu            [r2 + 16],     m1
+    movu            [r2 + r3 + 16],m1
+    lea             r2,            [r2 + 2 * r3]
+    movu            [r2],          m1
+    movu            [r2 + r3],     m1
+    movu            [r2 + 16],     m1
+    movu            [r2 + r3 + 16],m1
+    lea             r2,            [r2 + 2 * r3]
+    movu            [r2],          m1
+    movu            [r2 + r3],     m1
+    movu            [r2 + 16],     m1
+    movu            [r2 + r3 + 16],m1
+    lea             r2,            [r2 + 2 * r3]
+%endrep
+
+    RET
diff -r 0cc83d3c357a -r 2a7d0c62bff4 source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h	Thu Nov 21 16:19:54 2013 +0530
+++ b/source/common/x86/intrapred.h	Thu Nov 21 16:42:59 2013 +0530
@@ -29,5 +29,6 @@
 void x265_intra_pred_dc4_sse4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter);
 void x265_intra_pred_dc8_sse4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter);
 void x265_intra_pred_dc16_sse4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter);
+void x265_intra_pred_dc32_sse4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter);
 
 #endif // ifndef X265_INTRAPRED_H
diff -r 0cc83d3c357a -r 2a7d0c62bff4 source/test/intrapredharness.cpp
--- a/source/test/intrapredharness.cpp	Thu Nov 21 16:19:54 2013 +0530
+++ b/source/test/intrapredharness.cpp	Thu Nov 21 16:42:59 2013 +0530
@@ -75,6 +75,8 @@
     for (int i = 0; i <= 100; i++)
     {
         int rand_filter = rand() & 1;
+        if (width > 16)
+            rand_filter = 0;
 
         pixel left[MAX_CU_SIZE * 2 + 1];
         for (int k = 0; k < width * 2 + 1; k++)
@@ -296,9 +298,12 @@
             printf("intra_dc_%dx%d[filter=0]", size, size);
             REPORT_SPEEDUP(opt.intra_pred_dc[i], ref.intra_pred_dc[i],
                            pixel_buff + srcStride, pixel_buff, pixel_out_vec, FENC_STRIDE, 0);
-            printf("intra_dc_%dx%d[filter=1]", size, size);
-            REPORT_SPEEDUP(opt.intra_pred_dc[i], ref.intra_pred_dc[i],
+            if (size <= 16)
+            {
+                printf("intra_dc_%dx%d[filter=1]", size, size);
+                REPORT_SPEEDUP(opt.intra_pred_dc[i], ref.intra_pred_dc[i],
                            pixel_buff + srcStride, pixel_buff, pixel_out_vec, FENC_STRIDE, 1);
+            }
         }
     }
     if (opt.intra_pred_planar)


More information about the x265-devel mailing list