[x265] [PATCH] asm: fix intrapred_planar32x32 sse2 code for main12

rajesh at multicorewareinc.com rajesh at multicorewareinc.com
Tue Nov 17 13:49:25 CET 2015


# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1447764366 -19800
#      Tue Nov 17 18:16:06 2015 +0530
# Node ID 069c502d4957f36bb5887158b13dfe94d4e0f737
# Parent  e8f9a60d4cd9e73c9f2baf05c2ccda5af1892b46
asm: fix intrapred_planar32x32 sse2 code for main12

     intra_planar_32x32      7.13x    6106.53         43568.75

diff -r e8f9a60d4cd9 -r 069c502d4957 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Nov 16 16:44:33 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Nov 17 18:16:06 2015 +0530
@@ -963,10 +963,7 @@
         p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar4_sse2);
         p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar8_sse2);
         p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_sse2);
-
-#if X265_DEPTH <= 10
         p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_sse2);
-#endif /* X265_DEPTH <= 10 */
         ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse2);
 
         p.cu[BLOCK_4x4].intra_pred[2] = PFX(intra_pred_ang4_2_sse2);
diff -r e8f9a60d4cd9 -r 069c502d4957 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm	Mon Nov 16 16:44:33 2015 +0530
+++ b/source/common/x86/intrapred16.asm	Tue Nov 17 18:16:06 2015 +0530
@@ -933,6 +933,178 @@
 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
 ;---------------------------------------------------------------------------------------
 INIT_XMM sse2
+%if ARCH_X86_64 == 1 && BIT_DEPTH == 12
+cglobal intra_pred_planar32, 3,7,16                ; r0 = dst, r1 = dstStride, r2 = srcPix; 7 GPRs, 16 XMM regs
+    ; NOTE: align the stack to 64 bytes so that all local data sits in the same cache line
+    mov             r6, rsp                     ; keep the original rsp; it is restored just before RET
+    sub             rsp, 4*mmsize               ; reserve four XMM-sized spill slots
+    and             rsp, ~63                    ; round rsp down to a 64-byte boundary
+    %define         m16 [rsp + 0 * mmsize]      ; m16..m19 are stack slots, not registers
+    %define         m17 [rsp + 1 * mmsize]
+    %define         m18 [rsp + 2 * mmsize]
+    %define         m19 [rsp + 3 * mmsize]
+
+    add             r1, r1                      ; stride in bytes (pixels are accessed as words)
+    pxor            m12, m12                    ; m12 = 0, used below to widen words to dwords
+
+    movzx           r3d, word [r2 + 66]         ; topRight = above[32]
+    lea             r4, [planar32_table1]       ; table defined elsewhere; presumably per-column topRight weights - TODO confirm
+
+    movd            m0, r3d
+    pshufd          m0, m0, 0                   ; broadcast topRight to all four dwords
+
+    pmaddwd         m8, m0, [r4 + 0]            ; accumulators m8,m9,m10,m11,m7,m13,m14,m15 hold columns 0-3, 4-7, ..., 28-31
+    pmaddwd         m9, m0, [r4 + 16]
+    pmaddwd         m10, m0, [r4 + 32]
+    pmaddwd         m11, m0, [r4 + 48]
+    pmaddwd         m7, m0, [r4 + 64]
+    pmaddwd         m13, m0, [r4 + 80]
+    pmaddwd         m14, m0, [r4 + 96]
+    pmaddwd         m15, m0, [r4 + 112]
+
+    movzx           r3d, word [r2 + 194]        ; presumably bottomLeft = left[32] (32 words past the first row sample at byte 130)
+    movd            m0, r3d
+    pshufd          m0, m0, 0                   ; broadcast it to all four dwords
+
+    paddd           m8, m0                      ; add that sample once to every accumulator
+    paddd           m9, m0
+    paddd           m10, m0
+    paddd           m11, m0
+    paddd           m7, m0
+    paddd           m13, m0
+    paddd           m14, m0
+    paddd           m15, m0
+
+    paddd           m8, [pd_32]                 ; pd_32 is defined elsewhere; presumably the rounding offset for the final >> 6
+    paddd           m9, [pd_32]
+    paddd           m10, [pd_32]
+    paddd           m11, [pd_32]
+    paddd           m7, [pd_32]
+    paddd           m13, [pd_32]
+    paddd           m14, [pd_32]
+    paddd           m15, [pd_32]
+
+    movu            m1, [r2 + 2]                ; above[0..7] as words
+    punpckhwd       m5, m1, m12                 ; above[4..7] widened to dwords
+    pmaddwd         m2, m5, [pd_31]             ; pd_31 is defined elsewhere; presumably scales each sample by 31
+    paddd           m9, m2
+    psubd           m2, m0, m5                  ; m2 = broadcast sample - above[4..7]: per-row step added to m9 in the row macro
+
+    punpcklwd       m1, m12                     ; above[0..3] widened to dwords
+    pmaddwd         m5, m1, [pd_31]
+    paddd           m8, m5
+    psubd           m3, m0, m1                  ; m3 = per-row step for m8
+
+    movu            m1, [r2 + 18]               ; above[8..15]
+    punpckhwd       m5, m1, m12
+    pmaddwd         m4, m5, [pd_31]
+    paddd           m11, m4
+    psubd           m4, m0, m5                  ; m4 = per-row step for m11
+
+    punpcklwd       m1, m12
+    pmaddwd         m5, m1, [pd_31]
+    paddd           m10, m5
+    psubd           m5, m0, m1                  ; per-row step for m10 ...
+    mova            m16, m5                     ; ... spilled to stack slot m16 because m5 is reused below
+
+    movu            m1, [r2 + 34]               ; above[16..23]
+    punpckhwd       m6, m1, m12
+    psubd           m5, m0, m6                  ; m5 = per-row step for m13 (kept in a register)
+    pmaddwd         m6, [pd_31]
+    paddd           m13, m6
+
+    punpcklwd       m6, m1, m12
+    psubd           m1, m0, m6
+    mova            m17, m1                     ; slot m17 = per-row step for m7
+    pmaddwd         m6, [pd_31]
+    paddd           m7, m6
+
+    movu            m1, [r2 + 50]               ; above[24..31]
+    mova            m18, m1                     ; keep the raw words: m1 is overwritten before the low half is unpacked
+    punpckhwd       m6, m1, m12
+    psubd           m1, m0, m6                  ; m1 = per-row step for m15 (kept in a register)
+    pmaddwd         m6, [pd_31]
+    paddd           m15, m6
+
+    punpcklwd       m6, m18, m12
+    psubd           m12, m0, m6                 ; m12 stops being the zero register here; it is scratch from now on
+    mova            m19, m12                    ; slot m19 = per-row step for m14
+    pmaddwd         m6, [pd_31]
+    paddd           m14, m6
+
+    add             r2, 130                     ; r2 -> first per-row source sample (left column, presumably)
+    lea             r5, [planar32_table]        ; table defined elsewhere; presumably per-column weights for the row sample - TODO confirm
+
+%macro INTRA_PRED_PLANAR32_sse2 0               ; emits one output row of 32 pixels, then advances r0 and r2
+    movzx           r3d, word [r2]              ; current row's source sample
+    movd            m0, r3d
+    pshufd          m0, m0, 0                   ; broadcast it to all four dwords
+
+    pmaddwd         m6, m0, [r5]
+    pmaddwd         m12, m0, [r5 + 16]
+    paddd           m6, m8                      ; add the running column accumulators
+    paddd           m12, m9
+    paddd           m8, m3                      ; step the accumulators for the next row
+    paddd           m9, m2
+    psrad           m6, 6
+    psrad           m12, 6
+    packssdw        m6, m12                     ; narrow the two dword vectors back to 8 words
+    movu            [r0], m6                    ; columns 0-7
+
+    pmaddwd         m6, m0, [r5 + 32]
+    pmaddwd         m12, m0, [r5 + 48]
+    paddd           m6, m10
+    paddd           m12, m11
+    paddd           m10, m16                    ; step comes from stack slot m16
+    paddd           m11, m4
+    psrad           m6, 6
+    psrad           m12, 6
+    packssdw        m6, m12
+    movu            [r0 + 16], m6               ; columns 8-15
+
+    pmaddwd         m6, m0, [r5 + 64]
+    pmaddwd         m12, m0, [r5 + 80]
+    paddd           m6, m7
+    paddd           m12, m13
+    paddd           m7, m17                     ; step comes from stack slot m17
+    paddd           m13, m5
+    psrad           m6, 6
+    psrad           m12, 6
+    packssdw        m6, m12
+    movu            [r0 + 32], m6               ; columns 16-23
+
+    pmaddwd         m6, m0, [r5 + 96]
+    pmaddwd         m12, m0, [r5 + 112]
+    paddd           m6, m14
+    paddd           m12, m15
+    paddd           m14, m19                    ; step comes from stack slot m19
+    paddd           m15, m1
+    psrad           m6, 6
+    psrad           m12, 6
+    packssdw        m6, m12
+    movu            [r0 + 48], m6               ; columns 24-31
+
+    lea             r0, [r0 + r1]               ; next destination row
+    add             r2, 2                       ; next per-row source sample
+%endmacro
+
+    mov             r4, 8                       ; 8 iterations x 4 unrolled rows = 32 rows
+.loop:
+    INTRA_PRED_PLANAR32_sse2
+    INTRA_PRED_PLANAR32_sse2
+    INTRA_PRED_PLANAR32_sse2
+    INTRA_PRED_PLANAR32_sse2
+    dec             r4
+    jnz             .loop
+    mov             rsp, r6                     ; undo the manual stack realignment before the epilogue
+    RET
+
+%else
+; code for every other configuration (BIT_DEPTH == 10, or builds that are not x86-64)
+;---------------------------------------------------------------------------------------
+; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
+;---------------------------------------------------------------------------------------
+INIT_XMM sse2
 cglobal intra_pred_planar32, 3,3,16
     movd            m3, [r2 + 66]               ; topRight   = above[32]
 
@@ -1036,6 +1208,7 @@
 %assign x x+1
 %endrep
     RET
+%endif
 
 ;---------------------------------------------------------------------------------------
 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)


More information about the x265-devel mailing list