[x265] [PATCH 1 of 2] asm : Adding asm routine for dst4
nabajit at multicorewareinc.com
nabajit at multicorewareinc.com
Thu Nov 28 13:24:58 CET 2013
# HG changeset patch
# User Nabajit Deka
# Date 1385641262 -19800
# Thu Nov 28 17:51:02 2013 +0530
# Node ID cb54626347bc69690c2a6ee2983e57b76314e3e2
# Parent 2ba6c26c9febdc8c57d3014c0cf98d4897d3992d
asm : Adding asm routine for dst4.
diff -r 2ba6c26c9feb -r cb54626347bc source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm Thu Nov 28 15:04:04 2013 +0530
+++ b/source/common/x86/dct8.asm Thu Nov 28 17:51:02 2013 +0530
@@ -33,6 +33,11 @@
times 4 dw 64, -64
times 4 dw 36, -83
+tab_dst4: times 2 dw 29, 55, 74, 84 ; DST row 0, repeated so one pmaddwd covers two packed input rows
+ times 2 dw 74, 74, 0, -74 ; DST row 1
+ times 2 dw 84, -29, -74, 55 ; DST row 2
+ times 2 dw 55, -84, 74, -29 ; DST row 3
+
SECTION .text
cextern pd_1
@@ -219,3 +224,102 @@
movhps [r1 + r2], m1
RET
+
+;------------------------------------------------------
+;void dst4(int16_t *src, int32_t *dst, intptr_t stride)
+;------------------------------------------------------
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal dst4, 3, 4, 8+2
+%else ; ARCH_X86_64 = 0
+cglobal dst4, 3, 4, 8
+%endif ; ARCH_X86_64
+; r0 = src (4 rows of 4 int16), r1 = dst (16 int32, written contiguously), r2 = stride, r3 = tab_dst4
+ %define coef0 m6
+ %define coef1 m7
+%if ARCH_X86_64
+ %define coef2 m8
+ %define coef3 m9
+%else
+ %define coef2 [r3 + 2 * 16]
+ %define coef3 [r3 + 3 * 16]
+%endif
+; 32-bit builds use only m0-m7, so coef2/coef3 stay as memory operands there
+ add r2, r2 ; stride -> byte offset (presumably given in int16 units); full-width add keeps a negative intptr_t stride valid
+ lea r3, [tab_dst4] ; r3 = base of the four 16-byte coefficient rows
+
+ mova m5, [pd_1] ; stage-1 rounding term; NOTE(review): round 1 / shift 1 are hard-coded, presumably 8-bit depth only - confirm
+
+ mova coef0, [r3 + 0 * 16]
+ mova coef1, [r3 + 1 * 16]
+%if ARCH_X86_64
+ mova coef2, [r3 + 2 * 16]
+ mova coef3, [r3 + 3 * 16]
+%endif
+
+ movh m0, [r0 + 0 * r2] ; load src row 0 (4 x int16)
+ movh m1, [r0 + 1 * r2] ; src row 1
+ punpcklqdq m0, m1 ; m0 = row 0 | row 1
+
+ lea r0, [r0 + 2 * r2] ; advance src by two rows
+ movh m1, [r0] ; src row 2
+ movh m2, [r0 + r2] ; src row 3
+ punpcklqdq m1, m2 ; m1 = row 2 | row 3
+
+ pmaddwd m2, m0, coef0 ; DST1: pairwise products of rows 0/1 with coefficient row 0
+ pmaddwd m3, m1, coef0 ; same for rows 2/3
+ phaddd m2, m3 ; m2 = dot(coef0, row i) for i = 0..3
+ paddd m2, m5
+ psrad m2, 1 ; (x + 1) >> 1
+
+ pmaddwd m3, m0, coef1
+ pmaddwd m4, m1, coef1
+ phaddd m3, m4 ; m3 = dot(coef1, row i) for i = 0..3
+ paddd m3, m5
+ psrad m3, 1
+ packssdw m2, m3 ; m2 = T70: stage-1 outputs for coef0 | coef1, narrowed to int16 with signed saturation
+
+ pmaddwd m3, m0, coef2
+ pmaddwd m4, m1, coef2
+ phaddd m3, m4 ; m3 = dot(coef2, row i) for i = 0..3
+ paddd m3, m5
+ psrad m3, 1
+
+ pmaddwd m0, coef3 ; source rows are no longer needed, so m0/m1 are overwritten
+ pmaddwd m1, coef3
+ phaddd m0, m1 ; m0 = dot(coef3, row i) for i = 0..3
+ paddd m0, m5
+ psrad m0, 1
+ packssdw m3, m0 ; m3 = T71: stage-1 outputs for coef2 | coef3 as int16
+
+ mova m5, [pd_128] ; stage-2 rounding term; NOTE(review): pd_128 must be declared (cextern) elsewhere - not visible in this hunk
+
+ pmaddwd m0, m2, coef0 ; DST2: second pass over the stage-1 results
+ pmaddwd m1, m3, coef0
+ phaddd m0, m1
+ paddd m0, m5
+ psrad m0, 8 ; (x + 128) >> 8
+ movu [r1 + 0 * 16], m0 ; dst[0..3]
+
+ pmaddwd m0, m2, coef1
+ pmaddwd m1, m3, coef1
+ phaddd m0, m1
+ paddd m0, m5
+ psrad m0, 8
+ movu [r1 + 1 * 16], m0 ; dst[4..7]
+
+ pmaddwd m0, m2, coef2
+ pmaddwd m1, m3, coef2
+ phaddd m0, m1
+ paddd m0, m5
+ psrad m0, 8
+ movu [r1 + 2 * 16], m0 ; dst[8..11]
+
+ pmaddwd m2, coef3 ; last use of m2/m3, so compute in place
+ pmaddwd m3, coef3
+ phaddd m2, m3
+ paddd m2, m5
+ psrad m2, 8
+ movu [r1 + 3 * 16], m2 ; dst[12..15]
+
+ RET
diff -r 2ba6c26c9feb -r cb54626347bc source/common/x86/dct8.h
--- a/source/common/x86/dct8.h Thu Nov 28 15:04:04 2013 +0530
+++ b/source/common/x86/dct8.h Thu Nov 28 17:51:02 2013 +0530
@@ -26,5 +26,6 @@
void x265_dct4_sse2(int16_t *src, int32_t *dst, intptr_t stride);
void x265_idct4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
+void x265_dst4_ssse3(int16_t *src, int32_t *dst, intptr_t stride); // SSSE3 4x4 DST; presumably the "cglobal dst4" routine in dct8.asm
#endif // ifndef X265_DCT8_H
More information about the x265-devel
mailing list