[x265] [PATCH] asm: avx2 assembly code for idct4x4

murugan at multicorewareinc.com murugan at multicorewareinc.com
Tue Sep 30 13:17:55 CEST 2014


# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1412073901 -19800
#      Tue Sep 30 16:15:01 2014 +0530
# Node ID 0a46f4955806182667cd6e23e39ab1887b722e23
# Parent  5a6845566d1492d29af29ecc0cf75d644994735c
asm: avx2 assembly code for idct4x4

diff -r 5a6845566d14 -r 0a46f4955806 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Sep 29 17:37:47 2014 -0500
+++ b/source/common/x86/asm-primitives.cpp	Tue Sep 30 16:15:01 2014 +0530
@@ -1448,6 +1448,7 @@
         p.dct[DCT_8x8] = x265_dct8_avx2;
         p.dct[DCT_16x16] = x265_dct16_avx2;
         p.dct[DCT_32x32] = x265_dct32_avx2;
+        p.idct[IDCT_4x4] = x265_idct4_avx2;
         p.idct[IDCT_8x8] = x265_idct8_avx2;
         p.idct[IDCT_16x16] = x265_idct16_avx2;
         p.idct[IDCT_32x32] = x265_idct32_avx2;
@@ -1785,6 +1786,7 @@
         p.dct[DCT_8x8] = x265_dct8_avx2;
         p.dct[DCT_16x16] = x265_dct16_avx2;
         p.dct[DCT_32x32] = x265_dct32_avx2;
+        p.idct[IDCT_4x4] = x265_idct4_avx2;
         p.idct[IDCT_8x8] = x265_idct8_avx2;
         p.idct[IDCT_16x16] = x265_idct16_avx2;
         p.idct[IDCT_32x32] = x265_idct32_avx2;
diff -r 5a6845566d14 -r 0a46f4955806 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm	Mon Sep 29 17:37:47 2014 -0500
+++ b/source/common/x86/dct8.asm	Tue Sep 30 16:15:01 2014 +0530
@@ -240,6 +240,15 @@
 avx2_dct4:      dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64
                 dw 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36, -83, 36, -83
 
+avx2_idct4_1:   dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64
+                dw 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36 ,-83, 36, -83
+
+avx2_idct4_2:   dw 64, 64, 64, -64, 83, 36, 36, -83
+
+idct4_shuf1:    times 2 db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15
+
+idct4_shuf2:    times 2 db 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8 ,9 ,10, 11
+
 tab_dct4:       times 4 dw 64, 64
                 times 4 dw 83, 36
                 times 4 dw 64, -64
@@ -2601,4 +2610,75 @@
     dec             r4d
     jnz             .pass2
     RET
+
+;-------------------------------------------------------
+; void idct4(int32_t *src, int16_t *dst, intptr_t stride)
+;-------------------------------------------------------
+INIT_YMM avx2
+cglobal idct4, 3, 4, 6
+
+%define             IDCT_SHIFT1         7
+%if BIT_DEPTH == 10
+    %define         IDCT_SHIFT2        10
+    vpbroadcastd    m5,                [pd_512]
+%elif BIT_DEPTH == 8
+    %define         IDCT_SHIFT2        12
+    vpbroadcastd    m5,                [pd_2048]
+%else
+    %error Unsupported BIT_DEPTH!
 %endif
+    vbroadcasti128  m4, [pd_64]
+
+    add             r2d, r2d
+    lea             r3, [r2 * 3]
+
+    movu            m0, [r0]                      ;[00 01 02 03 10 11 12 13]
+    movu            m1, [r0 + 32]                 ;[20 21 22 23 30 31 32 33]
+
+    packssdw        m0, m1                        ;[00 01 02 03 20 21 22 23 10 11 12 13 30 31 32 33]
+    pshufb          m0, [idct4_shuf1]             ;[00 20 02 22 01 21 03 23 10 30 12 32 11 31 13 33]
+    vpermq          m2, m0, 0x44                  ;[00 20 02 22 01 21 03 23 00 20 02 22 01 21 03 23]
+    vpermq          m0, m0, 0xEE                  ;[10 30 12 32 11 31 13 33 10 30 12 32 11 31 13 33]
+
+    mova            m1, [avx2_idct4_1]
+    mova            m3, [avx2_idct4_1 + 32]
+    pmaddwd         m1, m2
+    pmaddwd         m3, m0
+
+    paddd           m0, m1, m3
+    paddd           m0, m4
+    psrad           m0, IDCT_SHIFT1               ;[00 20 10 30 01 21 11 31]
+
+    psubd           m1, m3
+    paddd           m1, m4
+    psrad           m1, IDCT_SHIFT1               ;[03 23 13 33 02 22 12 32]
+
+    packssdw        m0, m1                        ;[00 20 10 30 03 23 13 33 01 21 11 31 02 22 12 32]
+    vmovshdup       m1, m0                        ;[10 30 10 30 13 33 13 33 11 31 11 31 12 32 12 32]
+    vmovsldup       m0, m0                        ;[00 20 00 20 03 23 03 23 01 21 01 21 02 22 02 22]
+
+    vpbroadcastq    m2, [avx2_idct4_2]
+    vpbroadcastq    m3, [avx2_idct4_2 + 8]
+    pmaddwd         m0, m2
+    pmaddwd         m1, m3
+
+    paddd           m2, m0, m1
+    paddd           m2, m5
+    psrad           m2, IDCT_SHIFT2               ;[00 01 10 11 30 31 20 21]
+
+    psubd           m0, m1
+    paddd           m0, m5
+    psrad           m0, IDCT_SHIFT2               ;[03 02 13 12 33 32 23 22]
+
+    pshufb          m0, [idct4_shuf2]             ;[02 03 12 13 32 33 22 23]
+    punpcklqdq      m1, m2, m0                    ;[00 01 02 03 10 11 12 13]
+    punpckhqdq      m2, m0                        ;[30 31 32 33 20 21 22 23]
+    packssdw        m1, m2                        ;[00 01 02 03 30 31 32 33 10 11 12 13 20 21 22 23]
+    vextracti128    xm0, m1, 1
+
+    movq            [r1], xm1
+    movq            [r1 + r2], xm0
+    movhps          [r1 + 2 * r2], xm0
+    movhps          [r1 + r3], xm1
+    RET
+%endif
diff -r 5a6845566d14 -r 0a46f4955806 source/common/x86/dct8.h
--- a/source/common/x86/dct8.h	Mon Sep 29 17:37:47 2014 -0500
+++ b/source/common/x86/dct8.h	Tue Sep 30 16:15:01 2014 +0530
@@ -34,6 +34,7 @@
 
 void x265_idst4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
 void x265_idct4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
+void x265_idct4_avx2(int32_t *src, int16_t *dst, intptr_t stride);
 void x265_idct8_ssse3(int32_t *src, int16_t *dst, intptr_t stride);
 void x265_idct8_avx2(int32_t *src, int16_t *dst, intptr_t stride);
 void x265_idct16_avx2(int32_t *src, int16_t *dst, intptr_t stride);


More information about the x265-devel mailing list