[x265] [PATCH] asm: assembly code for IntraPred_DC[16x16]

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Thu Nov 21 12:07:22 CET 2013


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1385030994 -19800
#      Thu Nov 21 16:19:54 2013 +0530
# Node ID 0cc83d3c357a5541bd7c159c4af1d1a3063860ae
# Parent  5768141583e8a6a828bb1837a789b9efd2f0493c
asm: assembly code for IntraPred_DC[16x16]

diff -r 5768141583e8 -r 0cc83d3c357a source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Nov 21 15:55:57 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Nov 21 16:19:54 2013 +0530
@@ -657,6 +657,7 @@
         p.quant = x265_quant_sse4;
         p.intra_pred_dc[BLOCK_4x4] = x265_intra_pred_dc4_sse4;
         p.intra_pred_dc[BLOCK_8x8] = x265_intra_pred_dc8_sse4;
+        p.intra_pred_dc[BLOCK_16x16] = x265_intra_pred_dc16_sse4;
     }
     if (cpuMask & X265_CPU_AVX)
     {
diff -r 5768141583e8 -r 0cc83d3c357a source/common/x86/intrapred.asm
--- a/source/common/x86/intrapred.asm	Thu Nov 21 15:55:57 2013 +0530
+++ b/source/common/x86/intrapred.asm	Thu Nov 21 16:19:54 2013 +0530
@@ -173,3 +173,119 @@
 
 .end
     RET
+
+;-------------------------------------------------------------------------------------------
+; void intra_pred_dc(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter)
+; 16x16 DC prediction: every dst pixel = (sum(above[0..15]) + sum(left[0..15]) + 16) >> 5.
+; If filter != 0, row 0 and column 0 are afterwards blended with their above/left neighbour.
+; Only above[0..15] and left[0..15] are read.
+;-------------------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal intra_pred_dc16, 5, 7, 4, above, left, dst, dstStride, filter
+
+    pxor            m0,            m0    ; m0 = 0: psadbw reference and pshufb broadcast mask
+    movu            m1,            [r0]
+    movu            m2,            [r1]
+    psadbw          m1,            m0    ; per 64-bit lane: sum of 8 above pixels
+    psadbw          m2,            m0    ; per 64-bit lane: sum of 8 left pixels
+    paddw           m1,            m2
+    pshufd          m2,            m1, 2 ; dword 0 of m2 = upper-lane partial sum
+    paddw           m1,            m2    ; dword 0 of m1 = sum of all 32 neighbours
+
+    movd            r5d,           m1
+    add             r5d,           16
+    shr             r5d,           5     ; r5d = DC = (sum + 16) / 32
+    movd            m1,            r5d
+    pshufb          m1,            m0    ; m1 = byte [dc_val ...]
+
+    ; store DC 16x16 (r2 walks down two rows at a time)
+    mov             r6,            r2    ; r6 = dst origin, needed again by the filter
+    movu            [r2],          m1
+    movu            [r2 + r3],     m1
+    lea             r2,            [r2 + 2 * r3]
+    movu            [r2],          m1
+    movu            [r2 + r3],     m1
+    lea             r2,            [r2 + 2 * r3]
+    movu            [r2],          m1
+    movu            [r2 + r3],     m1
+    lea             r2,            [r2 + 2 * r3]
+    movu            [r2],          m1
+    movu            [r2 + r3],     m1
+    lea             r2,            [r2 + 2 * r3]
+    movu            [r2],          m1
+    movu            [r2 + r3],     m1
+    lea             r2,            [r2 + 2 * r3]
+    movu            [r2],          m1
+    movu            [r2 + r3],     m1
+    lea             r2,            [r2 + 2 * r3]
+    movu            [r2],          m1
+    movu            [r2 + r3],     m1
+    lea             r2,            [r2 + 2 * r3]
+    movu            [r2],          m1
+    movu            [r2 + r3],     m1
+
+    ; Do DC Filter (test kept next to its branch so no later edit can clobber the flags)
+    test            r4d,           r4d
+    jz              .end
+    lea             r4d,           [r5d * 2 + 2]  ; r4d = DC * 2 + 2
+    add             r5d,           r4d            ; r5d = DC * 3 + 2
+    movd            m1,            r5d
+    pshuflw         m1,            m1, 0          ; m1 = pixDCx3
+    pshufd          m1,            m1, 0          ; all 8 words = DC * 3 + 2
+
+    ; filter top: dst[x] = (above[x] + DC * 3 + 2) >> 2, x = 0..15, one 16-byte store
+    pmovzxbw        m2,            [r0]           ; above[0..7]  as words
+    pmovzxbw        m3,            [r0 + 8]       ; above[8..15] as words
+    paddw           m2,            m1
+    paddw           m3,            m1
+    psraw           m2,            2
+    psraw           m3,            2
+    packuswb        m2,            m3             ; low 8 bytes from m2, high 8 from m3
+    movu            [r6],          m2
+
+    ; filter top-left: dst[0] = (above[0] + left[0] + DC * 2 + 2) >> 2 (overrides top result)
+    movzx           r0d, byte      [r0]
+    add             r4d,           r0d
+    movzx           r0d, byte      [r1]
+    add             r0d,           r4d
+    shr             r0d,           2
+    mov             [r6],          r0b
+
+    ; filter left: dst[y * stride] = (left[y] + DC * 3 + 2) >> 2, y = 1..15
+    add             r6,            r3             ; r6 = row 1
+    pmovzxbw        m2,            [r1 + 1]       ; left[1..8] as words
+    paddw           m2,            m1
+    psraw           m2,            2
+    packuswb        m2,            m2             ; byte k = filtered left[1 + k]
+    pextrb          [r6],          m2, 0          ; row 1
+    pextrb          [r6 + r3],     m2, 1          ; row 2
+    pextrb          [r6 + r3 * 2], m2, 2          ; row 3
+    lea             r6,            [r6 + r3 * 2]
+    add             r6,            r3             ; r6 = row 4
+    pextrb          [r6],          m2, 3          ; row 4
+    add             r6,            r3             ; r6 = row 5
+    pextrb          [r6],          m2, 4          ; row 5
+    pextrb          [r6 + r3],     m2, 5          ; row 6
+    pextrb          [r6 + r3 * 2], m2, 6          ; row 7
+    lea             r6,            [r6 + r3 * 2]
+    add             r6,            r3             ; r6 = row 8
+    pextrb          [r6],          m2, 7          ; row 8
+
+    add             r6,            r3             ; r6 = row 9
+    pmovzxbw        m3,            [r1 + 8]       ; left[8..15]: stays inside the 16 left pixels
+    paddw           m3,            m1
+    psraw           m3,            2
+    packuswb        m3,            m3             ; byte k = filtered left[8 + k]; byte 0 unused
+    pextrb          [r6],          m3, 1          ; row 9
+    pextrb          [r6 + r3],     m3, 2          ; row 10
+    pextrb          [r6 + r3 * 2], m3, 3          ; row 11
+    lea             r6,            [r6 + r3 * 2]
+    add             r6,            r3             ; r6 = row 12
+    pextrb          [r6],          m3, 4          ; row 12
+    add             r6,            r3             ; r6 = row 13
+    pextrb          [r6],          m3, 5          ; row 13
+    pextrb          [r6 + r3],     m3, 6          ; row 14
+    pextrb          [r6 + r3 * 2], m3, 7          ; row 15
+
+.end:
+    RET
diff -r 5768141583e8 -r 0cc83d3c357a source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h	Thu Nov 21 15:55:57 2013 +0530
+++ b/source/common/x86/intrapred.h	Thu Nov 21 16:19:54 2013 +0530
@@ -28,5 +28,6 @@
 
 void x265_intra_pred_dc4_sse4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter);
 void x265_intra_pred_dc8_sse4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter);
+void x265_intra_pred_dc16_sse4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter);
 
 #endif // ifndef X265_INTRAPRED_H


More information about the x265-devel mailing list