[x265] [PATCH] asm: 16bpp asm code for intra_pred_ang4 - mode 17, 18

yuvaraj at multicorewareinc.com yuvaraj at multicorewareinc.com
Tue Dec 10 13:23:33 CET 2013


# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1386678181 -19800
#      Tue Dec 10 17:53:01 2013 +0530
# Node ID 30610cea9d7e411950ad4bdb6b27fe27ca2dc2be
# Parent  ee69fed0ed3b5d79546bdbd1ac864b3cdebb4bc9
asm: 16bpp asm code for intra_pred_ang4 - mode 17,18

diff -r ee69fed0ed3b -r 30610cea9d7e source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Dec 10 17:05:01 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Dec 10 17:53:01 2013 +0530
@@ -741,6 +741,9 @@
         SETUP_INTRA_ANG4(14, 14, sse4);
         SETUP_INTRA_ANG4(15, 15, sse4);
         SETUP_INTRA_ANG4(16, 16, sse4);
+        SETUP_INTRA_ANG4(17, 17, sse4);
+        SETUP_INTRA_ANG4(18, 18, sse4);
+        SETUP_INTRA_ANG4(19, 17, sse4);
         SETUP_INTRA_ANG4(20, 16, sse4);
         SETUP_INTRA_ANG4(21, 15, sse4);
         SETUP_INTRA_ANG4(22, 14, sse4);
diff -r ee69fed0ed3b -r 30610cea9d7e source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Tue Dec 10 17:05:01 2013 +0530
+++ b/source/common/x86/const-a.asm	Tue Dec 10 17:53:01 2013 +0530
@@ -43,6 +43,7 @@
 const pb_unpackbd2, times 2 db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
 const pb_unpackwq1, db 0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3
 const pb_unpackwq2, db 4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7
+const pw_swap,      times 2 db 6,7,4,5,2,3,0,1   ; pshufb mask: the four words of the source's low qword in reversed order, written to both qwords
 
 const pb_01,       times  8 db 0,1
 const pb_0,        times 16 db 0
diff -r ee69fed0ed3b -r 30610cea9d7e source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm	Tue Dec 10 17:05:01 2013 +0530
+++ b/source/common/x86/intrapred16.asm	Tue Dec 10 17:53:01 2013 +0530
@@ -44,6 +44,7 @@
 cextern pw_4096
 cextern multiL
 cextern multi_2Row
+cextern pw_swap
 cextern pb_unpackwq1
 cextern pb_unpackwq2
 
@@ -916,3 +917,47 @@
     mova        m6, [r3 - 18 * 16]  ; [ 1]
     mova        m7, [r3 -  7 * 16]  ; [12]
     jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
+
+cglobal intra_pred_ang4_17, 4,4,8   ; 4x4 angular intra prediction for mode 17; also entered for mode 19 (mode number is read from r4m)
+    cmp         r4m, byte 19
+    jnz        .load
+    xchg        r2, r3              ; mode 19: swap the two reference pointers so the mode-17 setup below can be reused
+.load:                              ; lane comments: words listed high -> low; digit n = word at [r2 + 2*n]; x,y,z = words taken from [r3]; '-' = unused
+    movu        m6, [r2 - 2]    ; [- - 4 3 2 1 0 x]
+    palignr     m2, m6, 2       ; [- - - 4 3 2 1 0]
+    palignr     m1, m6, 4       ; [- - - - 4 3 2 1]
+    mova        m4, m2
+    punpcklwd   m2, m1          ; [4 3 3 2 2 1 1 0]
+
+    pinsrw      m6, [r3 + 2], 0     ; x = word at [r3 + 2]
+    punpcklwd   m3, m6, m4      ; [3 2 2 1 1 0 0 x]
+
+    pslldq      m4, m6, 2       ; [- 4 3 2 1 0 x y]
+    pinsrw      m4, [r3 + 4], 0     ; y = word at [r3 + 4]
+    pslldq      m5, m4, 2       ; [4 3 2 1 0 x y z]
+    pinsrw      m5, [r3 + 8], 0     ; z = word at [r3 + 8] (offset 6 is skipped)
+    punpcklwd   m5, m4          ; [1 0 0 x x y y z]
+    punpcklwd   m4, m3          ; [2 1 1 0 0 x x y]
+
+    lea         r3, [ang_table + 14 * 16]   ; r3 -> entry 14 of ang_table (indexed in 16-byte steps); loads below pick entries 6, 12, 18, 24
+    mova        m0, [r3 -  8 * 16]  ; [ 6]
+    mova        m1, [r3 -  2 * 16]  ; [12]
+    mova        m6, [r3 +  4 * 16]  ; [18]
+    mova        m7, [r3 + 10 * 16]  ; [24]
+    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)   ; tail-jump into the shared .do_filter4x4 code of intra_pred_ang4_3 (not shown here; it defines the register contract)
+
+
+cglobal intra_pred_ang4_18, 4,4,1   ; 4x4 intra prediction, mode 18: every row is the row below it shifted by one 16-bit pixel
+    movh        m0, [r2]            ; low qword = four words at [r2]: [3 2 1 0] (high -> low)
+    pshufb      m0, [pw_swap]       ; reverse the word order: [0 1 2 3]
+    movhps      m0, [r3 + 2]        ; high qword = four words at [r3 + 2]; movhps rather than pinsrq, which needs REX.W and cannot be assembled for 32-bit targets
+    add         r1, r1              ; double r1 (presumably stride in pixels -> bytes for 16-bit samples)
+    lea         r2, [r1 * 3]        ; r2 = byte offset of row 3
+    movh        [r0 + r2], m0       ; row 3 = words 0..3 of m0
+    psrldq      m0, 2               ; advance the 4-word window by one pixel
+    movh        [r0 + r1 * 2], m0   ; row 2
+    psrldq      m0, 2
+    movh        [r0 + r1], m0       ; row 1
+    psrldq      m0, 2
+    movh        [r0], m0            ; row 0
+    RET


More information about the x265-devel mailing list