[x265] [PATCH] asm: avx2 asm code for dct4

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Thu Aug 28 07:47:22 CEST 2014


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1409204690 -19800
#      Thu Aug 28 11:14:50 2014 +0530
# Node ID 580cd21422b84fad1b4a718dddcd79e23bfdf734
# Parent  b18ae1fe86b8774344695516b649fd44e33d17b3
asm: avx2 asm code for dct4

diff -r b18ae1fe86b8 -r 580cd21422b8 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Aug 27 16:54:39 2014 -0500
+++ b/source/common/x86/asm-primitives.cpp	Thu Aug 28 11:14:50 2014 +0530
@@ -1432,6 +1432,7 @@
     }
     if (cpuMask & X265_CPU_AVX2)
     {
+        p.dct[DCT_4x4] = x265_dct4_avx2;
     }
 
     /* at HIGH_BIT_DEPTH, pixel == short so we can reuse a number of primitives */
@@ -1715,6 +1716,8 @@
         p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_avx2;
         p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_avx2;
         p.denoiseDct = x265_denoise_dct_avx2;
+
+        p.dct[DCT_4x4] = x265_dct4_avx2;
     }
 #endif // if HIGH_BIT_DEPTH
 }
diff -r b18ae1fe86b8 -r 580cd21422b8 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm	Wed Aug 27 16:54:39 2014 -0500
+++ b/source/common/x86/dct8.asm	Thu Aug 28 11:14:50 2014 +0530
@@ -35,6 +35,11 @@
                 times 4 dw 64, -64
                 times 4 dw 36, -83
 
+avx2_dct4:      dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64 ; even part: low lane {64, 64} -> coeff 0, high lane {64,-64} -> coeff 2
+                dw 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36, -83, 36, -83 ; odd part:  low lane {83, 36} -> coeff 1, high lane {36,-83} -> coeff 3
+
+dct4_shuf:      db 0, 1, 2, 3, 8, 9, 10, 11, 6, 7, 4, 5, 14, 15, 12, 13 ; pshufb mask, word order 0,1,4,5,3,2,7,6: two rows a,b -> a0 a1 b0 b1 a3 a2 b3 b2
+
 tab_dst4:       times 2 dw 29, 55, 74, 84
                 times 2 dw 74, 74,  0, -74
                 times 2 dw 84, -29, -74, 55
@@ -187,6 +192,77 @@
 
     RET
 
+
+; void dct4(int16_t *src, int32_t *dst, intptr_t stride) -- 4x4 forward DCT, AVX2
+;
+; Input parameters:
+; - r0:     source, 4 rows of 4 int16 samples
+; - r1:     destination, 16 int32 coefficients stored contiguously (64 bytes)
+; - r2:     source stride in int16 units (doubled below to get a byte offset)
+INIT_YMM avx2
+cglobal dct4, 3, 4, 8, src, dst, srcStride
+%if BIT_DEPTH == 10
+    %define DCT_SHIFT 3
+    vbroadcasti128 m7, [pd_4]                   ; m7 = first-pass rounding term (presumably dwords of 1 << (DCT_SHIFT - 1))
+%elif BIT_DEPTH == 8
+    %define DCT_SHIFT 1
+    vbroadcasti128 m7, [pd_1]                   ; m7 = first-pass rounding term for the 8-bit build
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+    add             r2d, r2d                    ; elements -> bytes; NOTE(review): 32-bit add zero-extends, so stride is assumed non-negative
+    lea             r3, [avx2_dct4]             ; r3 -> coefficient table (two 32-byte rows)
+
+    vbroadcasti128  m4, [dct4_shuf]             ; same byte shuffle in both 128-bit lanes
+    mova            m5, [r3]                    ; m5 = {64, 64} x4 | {64,-64} x4
+    mova            m6, [r3 + 32]               ; m6 = {83, 36} x4 | {36,-83} x4
+    movq            xm0, [r0]                   ; row 0
+    movq            xm1, [r0 + r2]              ; row 1
+    punpcklqdq      m0, m1                      ; low lane of m0 = row0 row1
+    lea             r0, [r0 + 2 * r2]           ; advance two rows
+    movq            xm1, [r0]                   ; row 2
+    movq            xm2, [r0 + r2]              ; row 3
+    punpcklqdq      m1, m2                      ; low lane of m1 = row2 row3
+
+    vinserti128     m0, m0, xm1, 1              ; m0 = row0 row1 | row2 row3
+    pshufb          m0, m4                      ; per lane (rows a,b): a0 a1 b0 b1 a3 a2 b3 b2
+    vpermq          m1, m0, 11011101b           ; qwords 1,3,1,3: the {x3,x2} pairs of all four rows, in both lanes
+    vpermq          m0, m0, 10001000b           ; qwords 0,2,0,2: the {x0,x1} pairs of all four rows, in both lanes
+    paddw           m2, m0, m1                  ; even terms: x0+x3, x1+x2
+    psubw           m0, m1                      ; odd terms:  x0-x3, x1-x2
+
+    pmaddwd         m2, m5                      ; m2 = coeff 0 | coeff 2 of every input row
+    paddd           m2, m7
+    psrad           m2, DCT_SHIFT
+
+    pmaddwd         m0, m6                      ; m0 = coeff 1 | coeff 3 of every input row
+    paddd           m0, m7
+    psrad           m0, DCT_SHIFT
+
+    packssdw        m2, m0                      ; back to int16 (signed saturation): m2 = c0 c1 | c2 c3, i.e. the transposed intermediate
+    pshufb          m2, m4                      ; second pass: same pairing as the first pass
+    vpermq          m1, m2, 11011101b           ; {x3,x2} pairs
+    vpermq          m2, m2, 10001000b           ; {x0,x1} pairs
+    vbroadcasti128  m7, [pd_128]                ; rounding term for the >> 8 below (pd_128 presumably dwords of 128)
+
+    pmaddwd         m0, m2, m5                  ; this pass multiplies first and combines as dwords, not as words
+    pmaddwd         m3, m1, m5
+    paddd           m3, m0                      ; m3 = output row 0 | output row 2
+    paddd           m3, m7
+    psrad           m3, 8
+
+    pmaddwd         m2, m6
+    pmaddwd         m1, m6
+    psubd           m2, m1                      ; m2 = output row 1 | output row 3
+    paddd           m2, m7
+    psrad           m2, 8
+
+    movu            [r1], xm3                   ; output row 0
+    movu            [r1 + mmsize/2], xm2        ; output row 1: 16 bytes only, a ymm store would spill into row 2's slot
+    vextracti128    [r1 + mmsize], m3, 1        ; output row 2
+    vextracti128    [r1 + mmsize + mmsize/2], m2, 1 ; output row 3
+
+    RET
 ;-------------------------------------------------------
 ;void idct4(int32_t *src, int16_t *dst, intptr_t stride)
 ;-------------------------------------------------------
diff -r b18ae1fe86b8 -r 580cd21422b8 source/common/x86/dct8.h
--- a/source/common/x86/dct8.h	Wed Aug 27 16:54:39 2014 -0500
+++ b/source/common/x86/dct8.h	Thu Aug 28 11:14:50 2014 +0530
@@ -25,6 +25,7 @@
 #define X265_DCT8_H
 
 void x265_dct4_sse2(int16_t *src, int32_t *dst, intptr_t stride);
+void x265_dct4_avx2(int16_t *src, int32_t *dst, intptr_t stride);
 void x265_idct4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
 void x265_idct8_ssse3(int32_t *src, int16_t *dst, intptr_t stride);
 void x265_dst4_ssse3(int16_t *src, int32_t *dst, intptr_t stride);


More information about the x265-devel mailing list