[x265] [PATCH] 16bpp: assembly code for intra_pred_dc32

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Fri Dec 6 10:56:32 CET 2013


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1386323560 -19800
#      Fri Dec 06 15:22:40 2013 +0530
# Node ID 64690c4d2c8cb8faad603cc05b5f437f97746349
# Parent  53c7147e0e34ec247ab0c9f355c11245773e2fcf
16bpp: assembly code for intra_pred_dc32

diff -r 53c7147e0e34 -r 64690c4d2c8c source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Dec 06 15:03:57 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Dec 06 15:22:40 2013 +0530
@@ -657,6 +657,7 @@
         p.intra_pred[BLOCK_4x4][1] = x265_intra_pred_dc4_sse4;
         p.intra_pred[BLOCK_8x8][1] = x265_intra_pred_dc8_sse4;
         p.intra_pred[BLOCK_16x16][1] = x265_intra_pred_dc16_sse4;
+        p.intra_pred[BLOCK_32x32][1] = x265_intra_pred_dc32_sse4;
     }
     if (cpuMask & X265_CPU_XOP)
     {
diff -r 53c7147e0e34 -r 64690c4d2c8c source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm	Fri Dec 06 15:03:57 2013 +0530
+++ b/source/common/x86/intrapred16.asm	Fri Dec 06 15:22:40 2013 +0530
@@ -29,6 +29,7 @@
 SECTION .text
 
 cextern pw_1
+cextern pd_32
 cextern pw_4096
 
 
@@ -317,3 +318,67 @@
 .end
     RET
 
+
+;-------------------------------------------------------------------------------------------
+; intra_pred_dc32, SSE4, 16-bit pixels: fill a 32x32 block with the rounded mean of the
+; 64 pixels read from two neighbour arrays.
+;
+; Register roles as used by the code below. The prototype previously quoted here,
+; "(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter)", does not match
+; them -- TODO confirm against the C declaration of the primitive:
+;   r0     = destination; 32 rows of 32 16-bit pixels are written
+;   r1     = destination row stride; doubled on entry, so presumably given in pixels
+;   r2, r3 = the two neighbour arrays (presumably above and left; they are treated
+;            symmetrically); elements 1..32 of each are read
+; No other argument is read.
+;
+; Modifies r0-r3 and m0-m5.
+;-------------------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal intra_pred_dc32, 4, 4, 6
+    add             r1,                  r1          ; row stride -> bytes
+    add             r2,                  2           ; step over element 0 of each neighbour array
+    add             r3,                  2
+
+    ; m0 = lane-wise sum of the 32 pixels at r3
+    movu            m0,                  [r3]
+    movu            m1,                  [r3 + 16]
+    movu            m2,                  [r3 + 32]
+    movu            m3,                  [r3 + 48]
+    paddw           m0,                  m1
+    paddw           m2,                  m3
+    paddw           m0,                  m2
+
+    ; m1 = lane-wise sum of the 32 pixels at r2
+    movu            m1,                  [r2]
+    movu            m3,                  [r2 + 16]
+    movu            m4,                  [r2 + 32]
+    movu            m5,                  [r2 + 48]
+    paddw           m1,                  m3
+    paddw           m4,                  m5
+    paddw           m1,                  m4
+
+    ; After the next paddw each word lane holds the sum of 8 pixels. Widen to dwords before
+    ; reducing across lanes: pmaddwd reads its inputs as signed words, so the widening must
+    ; happen while every lane still fits in 15 bits (true for pixels of up to 12 bits).
+    ; Finishing the reduction in 16-bit lanes would only be exact for pixels of up to 10 bits.
+    paddw           m0,                  m1
+    pmaddwd         m0,                  [pw_1]      ; 4 dwords, each the sum of 16 pixels
+    phaddd          m0,                  m0
+    phaddd          m0,                  m0          ; every dword = sum of all 64 pixels
+
+    paddd           m0,                  [pd_32]     ; sum = sum + 32 (round to nearest)
+    psrld           m0,                  6           ; sum = sum / 64
+    pshuflw         m0,                  m0, 0       ; broadcast the low word (the DC value)
+    pshufd          m0,                  m0, 0       ; ... to all 8 word lanes
+
+    ; store DC 32x32: 32 rows, each 32 pixels = 64 bytes = 4 stores
+%rep 32
+    movu            [r0 +  0],           m0
+    movu            [r0 + 16],           m0
+    movu            [r0 + 32],           m0
+    movu            [r0 + 48],           m0
+    add             r0,                  r1
+%endrep
+
+    RET


More information about the x265-devel mailing list