[x265] [PATCH] asm: 16bpp asm code for intra_pred_ang4 - mode 17, 18
yuvaraj at multicorewareinc.com
yuvaraj at multicorewareinc.com
Tue Dec 10 14:25:01 CET 2013
# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1386681883 -19800
# Tue Dec 10 18:54:43 2013 +0530
# Node ID 084b0a9093329ff4ee0984d0e7a4ba29b4d9beee
# Parent ee69fed0ed3b5d79546bdbd1ac864b3cdebb4bc9
asm: 16bpp asm code for intra_pred_ang4 - mode 17,18
diff -r ee69fed0ed3b -r 084b0a909332 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Dec 10 17:05:01 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Dec 10 18:54:43 2013 +0530
@@ -741,6 +741,9 @@
SETUP_INTRA_ANG4(14, 14, sse4);
SETUP_INTRA_ANG4(15, 15, sse4);
SETUP_INTRA_ANG4(16, 16, sse4);
+ SETUP_INTRA_ANG4(17, 17, sse4);
+ SETUP_INTRA_ANG4(18, 18, sse4);
+ SETUP_INTRA_ANG4(19, 17, sse4);
SETUP_INTRA_ANG4(20, 16, sse4);
SETUP_INTRA_ANG4(21, 15, sse4);
SETUP_INTRA_ANG4(22, 14, sse4);
diff -r ee69fed0ed3b -r 084b0a909332 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Tue Dec 10 17:05:01 2013 +0530
+++ b/source/common/x86/const-a.asm Tue Dec 10 18:54:43 2013 +0530
@@ -43,6 +43,7 @@
const pb_unpackbd2, times 2 db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
const pb_unpackwq1, db 0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3
const pb_unpackwq2, db 4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7
+const pw_swap, times 2 db 6,7,4,5,2,3,0,1 ; pshufb mask: source words 3,2,1,0 (bytes 6,7 4,5 2,3 0,1); same pattern in both 8-byte halves
const pb_01, times 8 db 0,1
const pb_0, times 16 db 0
diff -r ee69fed0ed3b -r 084b0a909332 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Tue Dec 10 17:05:01 2013 +0530
+++ b/source/common/x86/intrapred16.asm Tue Dec 10 18:54:43 2013 +0530
@@ -44,6 +44,7 @@
cextern pw_4096
cextern multiL
cextern multi_2Row
+cextern pw_swap
cextern pb_unpackwq1
cextern pb_unpackwq2
@@ -916,3 +917,47 @@
mova m6, [r3 - 18 * 16] ; [ 1]
mova m7, [r3 - 7 * 16] ; [12]
jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
+
+; intra_pred_ang4_17 (also entered with r4m == 19, in which case r2 and r3 are exchanged first):
+; packs neighbouring 16-bit reference words into the pair registers m2-m5, loads ang_table rows
+; [6]/[12]/[18]/[24] into m0/m1/m6/m7, then tail-jumps to intra_pred_ang4_3's .do_filter4x4.
+cglobal intra_pred_ang4_17, 4,4,8
+    cmp         r4m, byte 19
+    jnz         .load
+    xchg        r2, r3                  ; r4m == 19: exchange the two reference pointers
+.load:
+    movu        m6, [r2 - 2]            ; [- - 4 3 2 1 0 x]
+    palignr     m2, m6, 2               ; [- - - 4 3 2 1 0]
+    palignr     m1, m6, 4               ; [- - - - 4 3 2 1]
+    mova        m4, m2
+    punpcklwd   m2, m1                  ; [4 3 3 2 2 1 1 0]
+    pinsrw      m6, [r3 + 2], 0         ; x = word at [r3 + 2]
+    punpcklwd   m3, m6, m4              ; [3 2 2 1 1 0 0 x]
+    pslldq      m4, m6, 2               ; [- 4 3 2 1 0 x y]   (3-operand form: m6 stays intact)
+    pinsrw      m4, [r3 + 4], 0         ; y = word at [r3 + 4]
+    pslldq      m5, m4, 2               ; [4 3 2 1 0 x y z]
+    pinsrw      m5, [r3 + 8], 0         ; z = word at [r3 + 8]
+    punpcklwd   m5, m4                  ; [1 0 0 x x y y z]   (must read m4 before it is interleaved below)
+    punpcklwd   m4, m6                  ; [2 1 1 0 0 x x y]   partner is the plain m6; m3 is already interleaved
+    lea         r3, [ang_table + 14 * 16]
+    mova        m0, [r3 - 8 * 16]       ; [ 6]
+    mova        m1, [r3 - 2 * 16]       ; [12]
+    mova        m6, [r3 + 4 * 16]       ; [18]
+    mova        m7, [r3 + 10 * 16]      ; [24]
+    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
+
+
+; intra_pred_ang4_18: builds one 8-word vector and stores four overlapping 4-word windows of it, one per row.
+cglobal intra_pred_ang4_18, 4,4,1
+    movh        m0, [r2]                ; low half  = the 4 words at [r2]
+    pshufb      m0, [pw_swap]           ; reverse the order of those 4 words
+    movhps      m0, [r3 + 2]            ; high half = the 4 words at [r3 + 2]
+    lea         r3, [r1 * 3]            ; r3 = 3 * r1 (r3 is no longer needed as a pointer)
+    movh        [r0 + r3 * 2], m0       ; row at byte offset 6 * r1: words 0..3
+    psrldq      m0, 2                   ; slide the window up by one word
+    movh        [r0 + r1 * 4], m0       ; row at byte offset 4 * r1: words 1..4
+    psrldq      m0, 2
+    movh        [r0 + r1 * 2], m0       ; row at byte offset 2 * r1: words 2..5
+    psrldq      m0, 2
+    movh        [r0], m0                ; first row: words 3..6
+    RET
More information about the x265-devel
mailing list