[x265] [PATCH 2 of 2] asm: Adding asm routine for idct4

nabajit at multicorewareinc.com nabajit at multicorewareinc.com
Wed Nov 27 14:11:34 CET 2013


# HG changeset patch
# User Nabajit Deka
# Date 1385557692 -19800
#      Wed Nov 27 18:38:12 2013 +0530
# Branch stable
# Node ID e4206a37c20f531312013d2a5879f6dbb58c05c5
# Parent  648c669afd7476f30e4f432d839b36fbb5390332
asm: Adding asm routine for idct4

diff -r 648c669afd74 -r e4206a37c20f source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm	Wed Nov 27 18:19:18 2013 +0530
+++ b/source/common/x86/dct8.asm	Wed Nov 27 18:38:12 2013 +0530
@@ -21,6 +21,8 @@
 ;* For more information, contact us at licensing at multicorewareinc.com.
 ;*****************************************************************************/
 
+;TO-DO : Further optimize the routines.
+
 %include "x86inc.asm"
 %include "x86util.asm"
 
@@ -34,7 +36,9 @@
 SECTION .text
 
 cextern pd_1
+cextern pd_64
 cextern pd_128
+cextern pd_2048
 
 ;------------------------------------------------------
 ;void dct4(int16_t *src, int32_t *dst, intptr_t stride)
@@ -128,3 +132,90 @@
     movu        [r1 + 3 * 16], m2
 
     RET
+
+;-------------------------------------------------------
+;void idct4(int32_t *src, int16_t *dst, intptr_t stride)
+;-------------------------------------------------------
+INIT_XMM sse2
+cglobal idct4, 3, 4, 7                      ; x86inc: 3 args (r0 = src, r1 = dst, r2 = stride), 4 GPRs, 7 XMM regs (m0-m6)
+    ; Two identical butterfly passes over 16 coefficients (round 64 / >> 7, then round 2048 / >> 12), each followed by a 4x4 word transpose.
+    add         r2d, r2d                    ; r2 = 2 * stride, used below as the dst row step in bytes; NOTE(review): 32-bit add clears the upper half of r2 on x86-64, so this presumably relies on a non-negative stride - confirm
+    lea         r3, [tab_dct4]              ; r3 = coefficient table (defined outside this hunk), read as four 16-byte rows
+
+    mova        m6, [pd_64]                 ; m6 = pass-1 rounding term, added before the >> 7
+
+    movu        m0, [r0 + 0 * 16]           ; src dwords 0-3
+    movu        m1, [r0 + 1 * 16]           ; src dwords 4-7
+    packssdw    m0, m1                      ; m0 = src[0..7] narrowed to int16 with signed saturation
+
+    movu        m1, [r0 + 2 * 16]           ; src dwords 8-11
+    movu        m2, [r0 + 3 * 16]           ; src dwords 12-15
+    packssdw    m1, m2                      ; m1 = src[8..15] narrowed to int16 with signed saturation
+
+    punpcklwd   m2, m0, m1                  ; m2 = {s0,s8, s1,s9, s2,s10, s3,s11}
+    pmaddwd     m3, m2, [r3 + 0 * 16]       ; m3 = E1
+    paddd       m3, m6                      ; fold the rounding term into E1 once; it survives both E1 + O1 and E1 - O1
+
+    pmaddwd     m2, [r3 + 2 * 16]           ; m2 = E2
+    paddd       m2, m6                      ; fold the rounding term into E2 likewise
+
+    punpckhwd   m0, m1                      ; m0 = {s4,s12, s5,s13, s6,s14, s7,s15}
+    pmaddwd     m1, m0, [r3 + 1 * 16]       ; m1 = O1
+    pmaddwd     m0, [r3 + 3 * 16]           ; m0 = O2
+
+    paddd       m4, m3, m1                  ; E1 + O1
+    psrad       m4, 7                       ; A = (E1 + O1) >> 7
+    paddd       m5, m2, m0                  ; E2 + O2
+    psrad       m5, 7                       ; B = (E2 + O2) >> 7
+    packssdw    m4, m5                      ; m4 = m128iA = {A, B} as int16
+
+    psubd       m2, m0                      ; E2 - O2
+    psrad       m2, 7                       ; C = (E2 - O2) >> 7
+    psubd       m3, m1                      ; E1 - O1
+    psrad       m3, 7                       ; D = (E1 - O1) >> 7
+    packssdw    m2, m3                      ; m2 = m128iD = {C, D} as int16
+
+    punpcklwd   m1, m4, m2                  ; m1 = S0 = {A0,C0, A1,C1, A2,C2, A3,C3}
+    punpckhwd   m4, m2                      ; m4 = S8 = {B0,D0, B1,D1, B2,D2, B3,D3}
+
+    punpcklwd   m0, m1, m4                  ; m0 = m128iA = {A0,B0,C0,D0, A1,B1,C1,D1} (4x4 word transpose, first half)
+    punpckhwd   m1, m4                      ; m1 = m128iD = {A2,B2,C2,D2, A3,B3,C3,D3} (second half)
+
+    mova        m6, [pd_2048]               ; m6 = pass-2 rounding term, added before the >> 12
+
+    punpcklwd   m2, m0, m1                  ; same word pairing as pass 1, now on the transposed intermediate
+    pmaddwd     m3, m2, [r3 + 0 * 16]
+    paddd       m3, m6                      ; m3 = E1
+
+    pmaddwd     m2, [r3 + 2 * 16]
+    paddd       m2, m6                      ; m2 = E2
+
+    punpckhwd   m0, m1                      ; pair the upper words of m0 with those of m1
+    pmaddwd     m1, m0, [r3 + 1 * 16]       ; m1 = O1
+    pmaddwd     m0, [r3 + 3 * 16]           ; m0 = O2
+
+    paddd       m4, m3, m1                  ; E1 + O1
+    psrad       m4, 12                      ; A = (E1 + O1) >> 12
+    paddd       m5, m2, m0                  ; E2 + O2
+    psrad       m5, 12                      ; B = (E2 + O2) >> 12
+    packssdw    m4, m5                      ; m4 = m128iA = {A, B} as int16
+
+    psubd       m2, m0                      ; E2 - O2
+    psrad       m2, 12                      ; C = (E2 - O2) >> 12
+    psubd       m3, m1                      ; E1 - O1
+    psrad       m3, 12                      ; D = (E1 - O1) >> 12
+    packssdw    m2, m3                      ; m2 = m128iD = {C, D} as int16
+
+    punpcklwd   m1, m4, m2                  ; m1 = {A0,C0, A1,C1, A2,C2, A3,C3}
+    punpckhwd   m4, m2                      ; m4 = {B0,D0, B1,D1, B2,D2, B3,D3}
+
+    punpcklwd   m0, m1, m4                  ; m0 = {A0,B0,C0,D0, A1,B1,C1,D1}
+    movlps      [r1 + 0 * r2], m0           ; low 4 words of m0 -> dst row 0
+    movhps      [r1 + 1 * r2], m0           ; high 4 words of m0 -> dst row 1
+
+    punpckhwd   m1, m4                      ; m1 = {A2,B2,C2,D2, A3,B3,C3,D3}
+    movlps      [r1 + 2 * r2], m1           ; low 4 words of m1 -> dst row 2
+    lea         r1, [r1 + 2 * r2]           ; advance dst by two rows so row 3 is reachable as [r1 + r2]
+    movhps      [r1 + r2], m1               ; high 4 words of m1 -> dst row 3
+
+    RET


More information about the x265-devel mailing list