[x265] [PATCH] asm: 16bpp asm code for intra_pred_ang4_3

yuvaraj at multicorewareinc.com yuvaraj at multicorewareinc.com
Mon Dec 9 09:00:03 CET 2013


# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1386575960 -19800
#      Mon Dec 09 13:29:20 2013 +0530
# Node ID 4cbe7691e9aeb2c19b935087dab2c0f196b775d4
# Parent  96841a72f275447825a266ad02cb1a50738513e0
asm: 16bpp asm code for intra_pred_ang4_3

diff -r 96841a72f275 -r 4cbe7691e9ae source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Dec 09 13:15:43 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Dec 09 13:29:20 2013 +0530
@@ -685,6 +685,8 @@
         p.intra_pred[BLOCK_8x8][1] = x265_intra_pred_dc8_sse4;
         p.intra_pred[BLOCK_16x16][1] = x265_intra_pred_dc16_sse4;
         p.intra_pred[BLOCK_32x32][1] = x265_intra_pred_dc32_sse4;
+
+        SETUP_INTRA_ANG4(3, 3, sse4);
     }
     if (cpuMask & X265_CPU_XOP)
     {
diff -r 96841a72f275 -r 4cbe7691e9ae source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm	Mon Dec 09 13:15:43 2013 +0530
+++ b/source/common/x86/intrapred16.asm	Mon Dec 09 13:29:20 2013 +0530
@@ -426,3 +426,60 @@
     psrldq             m0, 6
     movh        [r0 + r1], m0
     RET
+
+; intra_pred_ang4_3 (SSE4, 16-bit samples): 4x4 angular prediction, line weights 26/20/14/8.
+; r0 = dst, r1 = dst stride (doubled before the stores, so presumably counted in samples),
+; r2 = reference samples; when r4m == 33, r3m is used instead and the transpose is skipped.
+INIT_XMM sse4
+cglobal intra_pred_ang4_3, 3,4,8
+    cmp         r4m, byte 33    ; ZF feeds the cmove below and the jz after the filter
+    cmove       r2, r3mp
+    lea         r3, [ang_table + 20 * 32] ; entry [20]; NOTE(review): assumes 32-byte table entries - confirm
+    movu        m0, [r2 + 2]    ; [8 7 6 5 4 3 2 1]
+    palignr     m1, m0, 2       ; [x 8 7 6 5 4 3 2]
+    punpcklwd   m2, m0, m1      ; [5 4 4 3 3 2 2 1]
+    palignr     m5, m0, 4       ; [x x 8 7 6 5 4 3]
+    punpcklwd   m3, m1, m5      ; [6 5 5 4 4 3 3 2]
+    palignr     m1, m0, 6       ; [x x x 8 7 6 5 4]
+    punpcklwd   m4, m5, m1      ; [7 6 6 5 5 4 4 3]
+    psrldq      m0, 8           ; [0 0 0 0 8 7 6 5]  upper four samples moved down for the low-half unpack
+    punpcklwd   m5, m1, m0      ; [8 7 7 6 6 5 5 4]
+
+    mova        m0, [r3 + 6 * 32]   ; [26]
+    mova        m1, [r3]            ; [20]
+    mova        m6, [r3 - 6 * 32]   ; [14]
+    mova        m7, [r3 - 12 * 32]  ; [ 8]
+
+ALIGN 32
+.do_filter4x4:
+    pmaddwd m2, m0              ; line 0: adjacent sample pairs times the [26] weights
+    paddd   m2, [pd_16]         ; + pd_16 (presumably the rounding term for the shift below)
+    psrld   m2, 5               ; >> 5
+    pmaddwd m3, m1              ; line 1: weights [20]
+    paddd   m3, [pd_16]
+    psrld   m3, 5
+    packusdw m2, m3             ; m2 = line 1 : line 0, saturated back to words
+
+    pmaddwd m4, m6              ; line 2: weights [14]
+    paddd   m4, [pd_16]
+    psrld   m4, 5
+    pmaddwd m5, m7              ; line 3: weights [ 8]
+    paddd   m5, [pd_16]
+    psrld   m5, 5
+    packusdw m4, m5             ; m4 = line 3 : line 2
+
+    jz         .store           ; r4m == 33: ZF is still from the entry cmp (nothing since then writes EFLAGS)
+    ; transpose 4x4
+    punpckhwd    m0, m2, m4
+    punpcklwd    m2, m4
+    punpckhwd    m4, m2, m0
+    punpcklwd    m2, m0
+
+.store:
+    add         r1, r1                  ; stride *= 2: byte offset between lines of 16-bit samples
+    movh        [r0], m2                ; line 0 (4 words)
+    movhps      [r0 + r1], m2           ; line 1
+    movh        [r0 + r1 * 2], m4       ; line 2
+    lea         r1, [r1 * 3]
+    movhps      [r0 + r1], m4           ; line 3
+    RET
\ No newline at end of file


More information about the x265-devel mailing list