[x265] [PATCH 1 of 3] asm: Adding asm and header files for dct asm primitives

nabajit at multicorewareinc.com nabajit at multicorewareinc.com
Tue Nov 26 15:07:52 CET 2013


# HG changeset patch
# User Nabajit Deka
# Date 1385474627 -19800
#      Tue Nov 26 19:33:47 2013 +0530
# Node ID cdae16d2ebf3da0df9f7ec6af758bc34f6b2de12
# Parent  40d314225757b9a6009c98f456bd64d15c169b8c
asm: Adding asm and header files for dct asm primitives.

diff -r 40d314225757 -r cdae16d2ebf3 source/common/x86/dct8.asm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/x86/dct8.asm	Tue Nov 26 19:33:47 2013 +0530
@@ -0,0 +1,130 @@
+;*****************************************************************************
+;* Copyright (C) 2013 x265 project
+;*
+;* Authors: Nabajit Deka <nabajit at multicorewareinc.com>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+;*
+;* This program is also available under a commercial proprietary license.
+;* For more information, contact us at licensing at multicorewareinc.com.
+;*****************************************************************************
+
+%include "x86inc.asm"
+%include "x86util.asm"
+
+SECTION_RODATA 32
+; 4-point DCT coefficient pairs: each 16-byte row is one (c0, c1) word pair repeated 4x, the operand layout pmaddwd consumes
+tab_dct4:       dw  64,  64,  64,  64,  64,  64,  64,  64
+                dw  83,  36,  83,  36,  83,  36,  83,  36
+                dw  64, -64,  64, -64,  64, -64,  64, -64
+                dw  36, -83,  36, -83,  36, -83,  36, -83
+
+SECTION .text
+
+cextern pd_1
+cextern pd_128
+
+;-----------------------------------------------------------------------------
+; void dct4(int16_t *src, int32_t *dst, intptr_t stride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal dct4, 3, 4, 8
+    ; r0 = src: 4 rows (a..d) of 4 int16 (x0..x3), 'stride' elements apart; r1 = dst: 16 contiguous int32
+    add         r2, r2                      ; stride: int16 elements -> bytes, at full intptr_t width
+    lea         r3, [tab_dct4]              ; r3 = coefficient table base
+
+    mova        m4, [r3 + 0 * 16]           ; m4 = (64,  64) x 4
+    mova        m5, [r3 + 1 * 16]           ; m5 = (83,  36) x 4
+    mova        m6, [r3 + 2 * 16]           ; m6 = (64, -64) x 4
+
+    mova        m7, [pd_1]                  ; first-pass rounding term (external; presumably 4 dwords of 1)
+    ; ---- load rows a, b; comments list word lanes low -> high, '|' splits the two qwords ----
+    movh        m0, [r0 + 0 * r2]           ; a0 a1 a2 a3
+    movh        m1, [r0 + 1 * r2]           ; b0 b1 b2 b3
+    punpcklqdq  m0, m1                      ; a0 a1 a2 a3 | b0 b1 b2 b3
+    pshufd      m0, m0, 0xD8                ; a0 a1 b0 b1 | a2 a3 b2 b3
+    pshufhw     m0, m0, 0xB1                ; a0 a1 b0 b1 | a3 a2 b3 b2
+
+    lea         r0, [r0 + 2 * r2]           ; r0 -> row c
+    movh        m1, [r0]                    ; c0 c1 c2 c3
+    movh        m2, [r0 + r2]               ; d0 d1 d2 d3
+    punpcklqdq  m1, m2                      ; c0 c1 c2 c3 | d0 d1 d2 d3
+    pshufd      m1, m1, 0xD8                ; c0 c1 d0 d1 | c2 c3 d2 d3
+    pshufhw     m1, m1, 0xB1                ; c0 c1 d0 d1 | c3 c2 d3 d2
+
+    punpcklqdq  m2, m0, m1                  ; m2 = (x0, x1) of rows a, b, c, d
+    punpckhqdq  m0, m1                      ; m0 = (x3, x2) of rows a, b, c, d
+
+    paddw       m1, m2, m0                  ; m1 = (x0 + x3, x1 + x2) per row
+    psubw       m2, m0                      ; m2 = (x0 - x3, x1 - x2) per row
+    ; ---- first pass: coefficient kN of every row, one dword per row, then (v + round) >> 1 ----
+    pmaddwd     m0, m1, m4                  ; k0 = 64 * (x0 + x3) + 64 * (x1 + x2)
+    paddd       m0, m7
+    psrad       m0, 1
+
+    pmaddwd     m3, m2, m5                  ; k1 = 83 * (x0 - x3) + 36 * (x1 - x2)
+    paddd       m3, m7
+    psrad       m3, 1
+
+    packssdw    m0, m3                      ; k0a k0b k0c k0d | k1a k1b k1c k1d (signed-saturated to int16)
+    pshufd      m0, m0, 0xD8                ; k0a k0b k1a k1b | k0c k0d k1c k1d
+    pshufhw     m0, m0, 0xB1                ; k0a k0b k1a k1b | k0d k0c k1d k1c
+
+    pmaddwd     m1, m6                      ; k2 = 64 * (x0 + x3) - 64 * (x1 + x2)
+    paddd       m1, m7
+    psrad       m1, 1
+
+    pmaddwd     m2, [r3 + 3 * 16]           ; k3 = 36 * (x0 - x3) - 83 * (x1 - x2)
+    paddd       m2, m7
+    psrad       m2, 1
+
+    packssdw    m1, m2                      ; k2a k2b k2c k2d | k3a k3b k3c k3d (signed-saturated to int16)
+    pshufd      m1, m1, 0xD8                ; k2a k2b k3a k3b | k2c k2d k3c k3d
+    pshufhw     m1, m1, 0xB1                ; k2a k2b k3a k3b | k2d k2c k3d k3c
+
+    punpcklqdq  m2, m0, m1                  ; m2 = (kNa, kNb) for N = 0..3
+    punpckhqdq  m0, m1                      ; m0 = (kNd, kNc) for N = 0..3
+
+    mova        m7, [pd_128]                ; second-pass rounding term (external; presumably 4 dwords of 128)
+    ; ---- second pass: combine rows a..d of each kN (a = kNa etc.), (v + round) >> 8, 4 dwords per store ----
+    pmaddwd     m1, m2, m4                  ; 64a + 64b
+    pmaddwd     m3, m0, m4                  ; 64d + 64c
+    paddd       m1, m3                      ; 64a + 64b + 64c + 64d
+    paddd       m1, m7
+    psrad       m1, 8
+    movu        [r1 + 0 * 16], m1           ; dst[0..3]
+
+    pmaddwd     m1, m2, m5                  ; 83a + 36b
+    pmaddwd     m3, m0, m5                  ; 83d + 36c
+    psubd       m1, m3                      ; 83a + 36b - 36c - 83d
+    paddd       m1, m7
+    psrad       m1, 8
+    movu        [r1 + 1 * 16], m1           ; dst[4..7]
+
+    pmaddwd     m1, m2, m6                  ; 64a - 64b
+    pmaddwd     m3, m0, m6                  ; 64d - 64c
+    paddd       m1, m3                      ; 64a - 64b - 64c + 64d
+    paddd       m1, m7
+    psrad       m1, 8
+    movu        [r1 + 2 * 16], m1           ; dst[8..11]
+
+    pmaddwd     m2, [r3 + 3 * 16]           ; 36a - 83b
+    pmaddwd     m0, [r3 + 3 * 16]           ; 36d - 83c
+    psubd       m2, m0                      ; 36a - 83b + 83c - 36d
+    paddd       m2, m7
+    psrad       m2, 8
+    movu        [r1 + 3 * 16], m2           ; dst[12..15]
+
+    RET
diff -r 40d314225757 -r cdae16d2ebf3 source/common/x86/dct8.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/x86/dct8.h	Tue Nov 26 19:33:47 2013 +0530
@@ -0,0 +1,29 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Nabajit Deka <nabajit at multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at multicorewareinc.com.
+ *****************************************************************************/
+
+#ifndef X265_DCT8_H
+#define X265_DCT8_H
+#include <stdint.h> // int16_t, int32_t, intptr_t used by the prototype below
+void x265_dct4_sse2(int16_t *src, int32_t *dst, intptr_t stride); // SSE2 4x4 DCT from dct8.asm: reads 4 rows of 4 int16 'stride' elements apart, writes 16 contiguous int32
+
+#endif // ifndef X265_DCT8_H


More information about the x265-devel mailing list