[x265] [PATCH] asm: avx2 asm code for dct4

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Wed Aug 27 15:32:32 CEST 2014


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1409145968 -19800
#      Wed Aug 27 18:56:08 2014 +0530
# Node ID 27193515d4417c142fff97a1d96a3d7111b9d6d5
# Parent  77fe0cc583e8ec10275bc1b3c4bb116d5ceb51ac
asm: avx2 asm code for dct4
previous perf: 4.3x, with avx2: 5.4x

diff -r 77fe0cc583e8 -r 27193515d441 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Aug 27 14:25:17 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Aug 27 18:56:08 2014 +0530
@@ -1715,6 +1715,8 @@
         p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_avx2;
         p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_avx2;
         p.denoiseDct = x265_denoise_dct_avx2;
+
+        p.dct[DCT_4x4] = x265_dct4_avx2;
     }
 #endif // if HIGH_BIT_DEPTH
 }
diff -r 77fe0cc583e8 -r 27193515d441 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm	Wed Aug 27 14:25:17 2014 +0530
+++ b/source/common/x86/dct8.asm	Wed Aug 27 18:56:08 2014 +0530
@@ -35,6 +35,11 @@
                 times 4 dw 64, -64
                 times 4 dw 36, -83
 
+avx2_dct4:      dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64 ; pmaddwd pairs: low lane (64, 64), high lane (64, -64)
+                dw 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36, -83, 36, -83 ; pmaddwd pairs: low lane (83, 36), high lane (36, -83)
+
+dct4_shuf:      db 0, 1, 2, 3, 8, 9, 10, 11, 6, 7, 4, 5, 14, 15, 12, 13 ; pshufb mask, word order per lane: 0 1 4 5 | 3 2 7 6
+
 tab_dst4:       times 2 dw 29, 55, 74, 84
                 times 2 dw 74, 74,  0, -74
                 times 2 dw 84, -29, -74, 55
@@ -187,6 +192,77 @@
 
     RET
 
+
+; void dct4(int16_t *src, int32_t *dst, intptr_t stride)
+; 4x4 forward DCT done as two butterfly + pmaddwd passes; 16 int32 results are stored.
+; Input parameters:
+; - r0:     source (4 rows, 4 int16 values read from each)
+; - r1:     destination (16 int32 values, written with two unaligned 32-byte stores)
+; - r2:     source stride in int16 elements (doubled to a byte offset on entry)
+INIT_YMM avx2
+cglobal dct4, 3, 4, 8, src, dst, srcStride
+%if BIT_DEPTH == 10
+    %define DCT_SHIFT 3
+    vbroadcasti128 m7, [pd_4]                   ; rounding term added before the first-pass shift
+%elif BIT_DEPTH == 8
+    %define DCT_SHIFT 1
+    vbroadcasti128 m7, [pd_1]                   ; rounding term added before the first-pass shift
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+    add             r2, r2                      ; elements -> bytes; full width because stride is intptr_t
+    lea             r3, [avx2_dct4]
+
+    vbroadcasti128  m4, [dct4_shuf]
+    movu            m5, [r3]                    ; (64, 64) | (64, -64); unaligned load, table alignment not assumed
+    movu            m6, [r3 + 32]               ; (83, 36) | (36, -83)
+    movq            xm0, [r0]                   ; row 0
+    movq            xm1, [r0 + r2]              ; row 1
+    punpcklqdq      m0, m1                      ; rows 0|1
+    lea             r0, [r0 + 2 * r2]
+    movq            xm1, [r0]                   ; row 2
+    movq            xm2, [r0 + r2]              ; row 3
+    punpcklqdq      m1, m2                      ; rows 2|3
+
+    vinserti128     m0, m0, xm1, 1              ; m0 = rows 0|1 (low lane), rows 2|3 (high lane)
+    pshufb          m0, m4                      ; per lane: x0 x1 of both rows | x3 x2 of both rows
+    vpermq          m1, m0, 11011101b           ; m1 = (x3, x2) pairs of all four rows, in both lanes
+    vpermq          m0, m0, 10001000b           ; m0 = (x0, x1) pairs of all four rows, in both lanes
+    paddw           m2, m0, m1                  ; x0+x3, x1+x2
+    psubw           m0, m1                      ; x0-x3, x1-x2
+
+    pmaddwd         m2, m5                      ; low lane: coefficient 0, high lane: coefficient 2
+    paddd           m2, m7
+    psrad           m2, DCT_SHIFT
+
+    pmaddwd         m0, m6                      ; low lane: coefficient 1, high lane: coefficient 3
+    paddd           m0, m7
+    psrad           m0, DCT_SHIFT
+
+    packssdw        m2, m0                      ; saturate to int16; the 4x4 block is now transposed
+    pshufb          m2, m4                      ; same regrouping as above for the second pass
+    vpermq          m1, m2, 11011101b           ; m1 = (x3, x2) pairs
+    vpermq          m2, m2, 10001000b           ; m2 = (x0, x1) pairs
+    vbroadcasti128  m7, [pd_128]                ; rounding term added before the second-pass shift by 8
+
+    pmaddwd         m0, m2, m5
+    pmaddwd         m3, m1, m5
+    paddd           m3, m0                      ; low lane: output row 0, high lane: output row 2
+    paddd           m3, m7
+    psrad           m3, 8
+
+    pmaddwd         m2, m6
+    pmaddwd         m1, m6
+    psubd           m2, m1                      ; low lane: output row 1, high lane: output row 3
+    paddd           m2, m7
+    psrad           m2, 8
+
+    vinserti128     m0, m3, xm2, 1              ; m0 = output rows 0|1
+    vpermq          m3, m3, 11101110b           ; move output row 2 into the low lane
+    vinserti128     m2, m2, xm3, 0              ; m2 = output rows 2|3
+    movu            [r1], m0
+    movu            [r1 + mmsize], m2
+    RET
 ;-------------------------------------------------------
 ;void idct4(int32_t *src, int16_t *dst, intptr_t stride)
 ;-------------------------------------------------------
diff -r 77fe0cc583e8 -r 27193515d441 source/common/x86/dct8.h
--- a/source/common/x86/dct8.h	Wed Aug 27 14:25:17 2014 +0530
+++ b/source/common/x86/dct8.h	Wed Aug 27 18:56:08 2014 +0530
@@ -25,6 +25,7 @@
 #define X265_DCT8_H
 
 void x265_dct4_sse2(int16_t *src, int32_t *dst, intptr_t stride);
+void x265_dct4_avx2(int16_t *src, int32_t *dst, intptr_t stride);
 void x265_idct4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
 void x265_idct8_ssse3(int32_t *src, int16_t *dst, intptr_t stride);
 void x265_dst4_ssse3(int16_t *src, int32_t *dst, intptr_t stride);


More information about the x265-devel mailing list