[x265] [PATCH 1 of 2] asm : Adding asm routine for idst4
nabajit at multicorewareinc.com
nabajit at multicorewareinc.com
Fri Nov 29 13:31:05 CET 2013
# HG changeset patch
# User Nabajit Deka
# Date 1385728044 -19800
# Fri Nov 29 17:57:24 2013 +0530
# Node ID 189377dcf4a43a98f3a217d4db9866799068cb8d
# Parent 833d78aaf71edddf774605fefb8912aea3aeced6
asm : Adding asm routine for idst4
diff -r 833d78aaf71e -r 189377dcf4a4 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm Fri Nov 29 16:40:42 2013 +0530
+++ b/source/common/x86/dct8.asm Fri Nov 29 17:57:24 2013 +0530
@@ -33,6 +33,15 @@
times 4 dw 64, -64
times 4 dw 36, -83
+tab_idst4: times 4 dw 29, +84 ; row 0 (each row = 8 words = 16 bytes, read by idst4 as [r3 + n * 16]): AC pair weights, output 0
+ times 4 dw +74, +55 ; row 1: BD pair weights, output 0
+ times 4 dw 55, -29 ; row 2: AC pair weights, output 1
+ times 4 dw +74, -84 ; row 3: BD pair weights, output 1
+ times 4 dw 74, -74 ; row 4: AC pair weights, output 2
+ times 4 dw 0, +74 ; row 5: BD pair weights, output 2
+ times 4 dw 84, +55 ; row 6: AC pair weights, output 3
+ times 4 dw -74, -29 ; row 7: BD pair weights, output 3 -- NOTE(review): rows are used as pmaddwd memory operands; confirm the section keeps them 16-byte aligned
+
SECTION .text
cextern pd_1
@@ -219,3 +228,102 @@
movhps [r1 + r2], m1
RET
+
+;-------------------------------------------------------
+;void idst4(int32_t *src, int16_t *dst, intptr_t stride)
+;-------------------------------------------------------
+INIT_XMM sse2
+cglobal idst4, 3, 4, 6 ; r0 = src, r1 = dst, r2 = stride, r3 = table base; uses m0-m5
+; 4x4 block: two multiply-accumulate passes over tab_idst4 with a 4x4 word transpose in between.
+ add r2d, r2d ; stride *= 2, presumably int16_t units -> bytes; NOTE(review): 32-bit add, confirm stride is never negative on x86-64
+ lea r3, [tab_idst4] ; r3 = base of the 8 x 16-byte coefficient rows
+
+ mova m5, [pd_64] ; rounding term added before the >> 7 of pass 1 (pd_64 is defined outside this hunk)
+
+ movu m0, [r0 + 0 * 16] ; src[0..3]
+ movu m1, [r0 + 1 * 16] ; src[4..7]
+ packssdw m0, m1 ; m0 = src[0..7] narrowed to int16 with signed saturation
+
+ movu m1, [r0 + 2 * 16] ; src[8..11]
+ movu m2, [r0 + 3 * 16] ; src[12..15]
+ packssdw m1, m2 ; m1 = src[8..15] narrowed to int16 with signed saturation
+
+ punpcklwd m2, m0, m1 ; m2 = m128iAC: src[0..3] interleaved with src[8..11]
+ punpckhwd m0, m1 ; m0 = m128iBD: src[4..7] interleaved with src[12..15]
+
+ pmaddwd m1, m2, [r3 + 0 * 16] ; AC pairs * row 0
+ pmaddwd m3, m0, [r3 + 1 * 16] ; BD pairs * row 1
+ paddd m1, m3 ; sum of the four products per column
+ paddd m1, m5 ; + rounding
+ psrad m1, 7 ; m1 = S0 (pass-1 output 0, 4 dwords)
+
+ pmaddwd m3, m2, [r3 + 2 * 16] ; AC pairs * row 2
+ pmaddwd m4, m0, [r3 + 3 * 16] ; BD pairs * row 3
+ paddd m3, m4
+ paddd m3, m5 ; + rounding
+ psrad m3, 7 ; m3 = S8 (pass-1 output 1, 4 dwords)
+ packssdw m1, m3 ; m1 = m128iA: outputs 0 and 1 as int16
+
+ pmaddwd m3, m2, [r3 + 4 * 16] ; AC pairs * row 4
+ pmaddwd m4, m0, [r3 + 5 * 16] ; BD pairs * row 5
+ paddd m3, m4
+ paddd m3, m5 ; + rounding
+ psrad m3, 7 ; m3 = S0 (name reused: pass-1 output 2)
+
+ pmaddwd m2, [r3 + 6 * 16] ; AC pairs * row 6 (m2 no longer needed afterwards)
+ pmaddwd m0, [r3 + 7 * 16] ; BD pairs * row 7 (m0 no longer needed afterwards)
+ paddd m2, m0
+ paddd m2, m5 ; + rounding
+ psrad m2, 7 ; m2 = S8 (name reused: pass-1 output 3)
+ packssdw m3, m2 ; m3 = m128iD: outputs 2 and 3 as int16
+
+ punpcklwd m0, m1, m3 ; transpose step 1: outputs 0/2 interleaved
+ punpckhwd m1, m3 ; transpose step 1: outputs 1/3 interleaved
+
+ punpcklwd m2, m0, m1 ; transpose step 2: m2 = transposed rows 0 and 1
+ punpckhwd m0, m1 ; transpose step 2: m0 = transposed rows 2 and 3
+
+ mova m5, [pd_2048] ; rounding term added before the >> 12 of pass 2 (pd_2048 is defined outside this hunk)
+
+ punpcklwd m1, m2, m0 ; m1 = AC pairs for pass 2 (rows 0 and 2 interleaved)
+ punpckhwd m2, m0 ; m2 = BD pairs for pass 2 (rows 1 and 3 interleaved)
+
+ pmaddwd m0, m1, [r3 + 0 * 16] ; AC pairs * row 0
+ pmaddwd m3, m2, [r3 + 1 * 16] ; BD pairs * row 1
+ paddd m0, m3
+ paddd m0, m5 ; + rounding
+ psrad m0, 12 ; m0 = S0 (pass-2 output 0); NOTE(review): shift is fixed at 12 -- confirm intended bit depth
+
+ pmaddwd m3, m1, [r3 + 2 * 16] ; AC pairs * row 2
+ pmaddwd m4, m2, [r3 + 3 * 16] ; BD pairs * row 3
+ paddd m3, m4
+ paddd m3, m5 ; + rounding
+ psrad m3, 12 ; m3 = S8 (pass-2 output 1)
+ packssdw m0, m3 ; m0 = m128iA: outputs 0 and 1 as int16
+
+ pmaddwd m3, m1, [r3 + 4 * 16] ; AC pairs * row 4
+ pmaddwd m4, m2, [r3 + 5 * 16] ; BD pairs * row 5
+ paddd m3, m4
+ paddd m3, m5 ; + rounding
+ psrad m3, 12 ; m3 = S0 (name reused: pass-2 output 2)
+
+ pmaddwd m1, [r3 + 6 * 16] ; AC pairs * row 6 (m1 no longer needed afterwards)
+ pmaddwd m2, [r3 + 7 * 16] ; BD pairs * row 7 (m2 no longer needed afterwards)
+ paddd m1, m2
+ paddd m1, m5 ; + rounding
+ psrad m1, 12 ; m1 = S8 (name reused: pass-2 output 3)
+ packssdw m3, m1 ; m3 = m128iD: outputs 2 and 3 as int16
+
+ punpcklwd m1, m0, m3 ; final transpose step 1: outputs 0/2 interleaved
+ punpckhwd m0, m3 ; final transpose step 1: outputs 1/3 interleaved
+
+ punpcklwd m2, m1, m0 ; m2 = dst rows 0 and 1 (4 int16 each)
+ movlps [r1 + 0 * r2], m2 ; store row 0 (8 bytes)
+ movhps [r1 + 1 * r2], m2 ; store row 1 (8 bytes)
+
+ punpckhwd m1, m0 ; m1 = dst rows 2 and 3 (4 int16 each)
+ movlps [r1 + 2 * r2], m1 ; store row 2 (8 bytes)
+ lea r1, [r1 + 2 * r2] ; advance dst to row 2 so row 3 is reachable as r1 + r2 (no *3 scale exists)
+ movhps [r1 + r2], m1 ; store row 3 (8 bytes)
+
+ RET
diff -r 833d78aaf71e -r 189377dcf4a4 source/common/x86/dct8.h
--- a/source/common/x86/dct8.h Fri Nov 29 16:40:42 2013 +0530
+++ b/source/common/x86/dct8.h Fri Nov 29 17:57:24 2013 +0530
@@ -26,5 +26,6 @@
void x265_dct4_sse2(int16_t *src, int32_t *dst, intptr_t stride);
void x265_idct4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
+void x265_idst4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
#endif // ifndef X265_DCT8_H
More information about the x265-devel
mailing list