[x265] [PATCH] asm: 16bpp asm code for intra_pred_ang4_3
yuvaraj at multicorewareinc.com
yuvaraj at multicorewareinc.com
Mon Dec 9 09:00:03 CET 2013
# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1386575960 -19800
# Mon Dec 09 13:29:20 2013 +0530
# Node ID 4cbe7691e9aeb2c19b935087dab2c0f196b775d4
# Parent 96841a72f275447825a266ad02cb1a50738513e0
asm: 16bpp asm code for intra_pred_ang4_3
diff -r 96841a72f275 -r 4cbe7691e9ae source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Dec 09 13:15:43 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Dec 09 13:29:20 2013 +0530
@@ -685,6 +685,8 @@
p.intra_pred[BLOCK_8x8][1] = x265_intra_pred_dc8_sse4;
p.intra_pred[BLOCK_16x16][1] = x265_intra_pred_dc16_sse4;
p.intra_pred[BLOCK_32x32][1] = x265_intra_pred_dc32_sse4;
+
+ SETUP_INTRA_ANG4(3, 3, sse4);
}
if (cpuMask & X265_CPU_XOP)
{
diff -r 96841a72f275 -r 4cbe7691e9ae source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Mon Dec 09 13:15:43 2013 +0530
+++ b/source/common/x86/intrapred16.asm Mon Dec 09 13:29:20 2013 +0530
@@ -426,3 +426,60 @@
psrldq m0, 6
movh [r0 + r1], m0
RET
+
+INIT_XMM sse4
+cglobal intra_pred_ang4_3, 3,4,8     ; 4x4 angular intra prediction on 16-bit samples; args presumably (dst, stride, refLeft, refAbove, dirMode) - TODO confirm
+    cmp         r4m, byte 33         ; ZF = (arg 4 == 33); consumed by 'jz .store' below
+    cmove       r2, r3mp             ; arg 4 == 33: take the reference samples from arg 3 instead of arg 2
+    lea         r3, [ang_table + 20 * 32] ; r3 -> row [20]; 32-byte row stride assumed - TODO confirm against ang_table definition
+    movu        m0, [r2 + 2]         ; [8 7 6 5 4 3 2 1]
+    palignr     m1, m0, 2            ; [x 8 7 6 5 4 3 2]
+    punpcklwd   m2, m0, m1           ; [5 4 4 3 3 2 2 1]
+    palignr     m5, m0, 4            ; [x x 8 7 6 5 4 3]
+    punpcklwd   m3, m1, m5           ; [6 5 5 4 4 3 3 2]
+    palignr     m1, m0, 6            ; [x x x 8 7 6 5 4]
+    punpcklwd   m4, m5, m1           ; [7 6 6 5 5 4 4 3]
+    movhlps     m0, m0               ; [x x x x 8 7 6 5]  high qword -> low qword (a memory movhps would leave [4 3 2 1] in the low half)
+    punpcklwd   m5, m1, m0           ; [8 7 7 6 6 5 5 4]
+; m2..m5 = adjacent-sample pairs for rows 0..3; m0/m1/m6/m7 = the pmaddwd multiplier row for each of them
+    mova        m0, [r3 + 6 * 32]    ; [26]
+    mova        m1, [r3]             ; [20]
+    mova        m6, [r3 - 6 * 32]    ; [14]
+    mova        m7, [r3 - 12 * 32]   ; [ 8]
+; per row: (pair . multipliers + 16) >> 5, then two rows are packed into one register as unsigned words
+ALIGN 32
+.do_filter4x4:
+    pmaddwd     m2, m0
+    paddd       m2, [pd_16]
+    psrld       m2, 5
+
+    pmaddwd     m3, m1
+    paddd       m3, [pd_16]
+    psrld       m3, 5
+    packusdw    m2, m3               ; m2 = row 1 : row 0
+
+    pmaddwd     m4, m6
+    paddd       m4, [pd_16]
+    psrld       m4, 5
+
+    pmaddwd     m5, m7
+    paddd       m5, [pd_16]
+    psrld       m5, 5
+    packusdw    m4, m5               ; m4 = row 3 : row 2
+; ZF still comes from the 'cmp' at entry: cmov, lea and the SSE instructions above do not write EFLAGS
+    jz         .store                ; arg 4 == 33: rows are stored as computed, no transpose
+
+    ; transpose 4x4
+    punpckhwd   m0, m2, m4
+    punpcklwd   m2, m4
+    punpckhwd   m4, m2, m0
+    punpcklwd   m2, m0
+; write four 8-byte rows at r0, r0 + r1, r0 + 2*r1 and r0 + 3*r1
+.store:
+    add         r1, r1               ; r1 *= 2 (presumably stride in samples -> bytes for 16-bit pixels - TODO confirm)
+    movh        [r0], m2
+    movhps      [r0 + r1], m2
+    movh        [r0 + r1 * 2], m4
+    lea         r1, [r1 * 3]
+    movhps      [r0 + r1], m4
+    RET
\ No newline at end of file
More information about the x265-devel
mailing list