[x265] [PATCH] asm: avx2 code for idst4x4

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Wed Apr 1 08:29:07 CEST 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1427803839 -19800
#      Tue Mar 31 17:40:39 2015 +0530
# Node ID cfc64c1861c282a2aa8dec3f0be127eaaf121469
# Parent  ffa14b40f0fff3f6f22fe273458f2a4c83c50acf
asm: avx2 code for idst4x4

AVX2:
idst4x4         7.03x    314.85          2213.26

SSE2:
idst4x4         4.31x    514.69          2219.20

diff -r ffa14b40f0ff -r cfc64c1861c2 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Mar 31 17:27:23 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Mar 31 17:40:39 2015 +0530
@@ -1447,6 +1447,7 @@
 #if X86_64
     if (cpuMask & X265_CPU_AVX2)
     {
+        p.idst4x4 = x265_idst4_avx2;
         p.dst4x4 = x265_dst4_avx2;
         p.scale2D_64to32 = x265_scale2D_64to32_avx2;
 
diff -r ffa14b40f0ff -r cfc64c1861c2 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm	Tue Mar 31 17:27:23 2015 +0530
+++ b/source/common/x86/dct8.asm	Tue Mar 31 17:40:39 2015 +0530
@@ -275,6 +275,16 @@
                 times 4 dw 84, +55
                 times 4 dw -74, -29
 
+pw_idst4_tab:   times 4 dw  29,  84             ; [tab + 0 * 32], low  lane: word-pair weights for pmaddwd
+                times 4 dw  55, -29             ; [tab + 0 * 32], high lane
+                times 4 dw  74,  55             ; [tab + 1 * 32], low  lane
+                times 4 dw  74, -84             ; [tab + 1 * 32], high lane
+                times 4 dw  74, -74             ; [tab + 2 * 32], low  lane
+                times 4 dw  84,  55             ; [tab + 2 * 32], high lane
+                times 4 dw  0,   74             ; [tab + 3 * 32], low  lane
+                times 4 dw -74, -29             ; [tab + 3 * 32], high lane
+pb_idst4_shuf:  times 2 db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15    ; pshufb mask, same for both lanes: word order 0,4,1,5,2,6,3,7
+
 tab_dct8_1:     times 2 dw 89, 50, 75, 18
                 times 2 dw 75, -89, -18, -50
                 times 2 dw 50, 18, -89, 75
@@ -806,6 +816,81 @@
     movhps      [r1 + r2], m1
     RET
 
+;-----------------------------------------------------------------
+;void idst4(const int16_t* src, int16_t* dst, intptr_t dstStride)
+;-----------------------------------------------------------------
+INIT_YMM avx2
+cglobal idst4, 3, 4, 6
+%if BIT_DEPTH == 8
+  vpbroadcastd  m4, [pd_2048]              ; m4 = term added before the pass-2 shift (pd_2048 is defined outside this hunk)
+  %define       IDCT4_SHIFT 12
+%elif BIT_DEPTH == 10
+  vpbroadcastd  m4, [pd_512]               ; same role as above for the 10-bit build (pd_512 is defined outside this hunk)
+  %define       IDCT4_SHIFT 10
+%else
+  %error Unsupported BIT_DEPTH!
+%endif
+    add         r2d, r2d                   ; r2 = 2 * dstStride, used below as the byte step between dst rows (32-bit add)
+    lea         r3, [pw_idst4_tab]         ; r3 = base of the pmaddwd weight table
+
+    movu        xm0, [r0 + 0 * 16]         ; xm0 = src words 0..7  : rows s0, s1 of the 4x4 block (4 words each)
+    movu        xm1, [r0 + 1 * 16]         ; xm1 = src words 8..15 : rows s2, s3
+
+    punpcklwd   m2, m0, m1                 ; low lane: (s0[c], s2[c]) pairs, c = 0..3
+    punpckhwd   m0, m1                     ; low lane: (s1[c], s3[c]) pairs, c = 0..3
+
+    vinserti128 m2, m2, xm2, 1             ; replicate low lane into high lane: one pmaddwd can apply two weight rows
+    vinserti128 m0, m0, xm0, 1             ; same for the (s1, s3) pairs
+
+    vpbroadcastd m5, [pd_64]               ; m5 = term added before the pass-1 shift by 7 (pd_64 is defined outside this hunk)
+    pmaddwd     m1, m2, [r3 + 0 * 32]      ; lo: 29*s0 + 84*s2     | hi: 55*s0 - 29*s2
+    pmaddwd     m3, m0, [r3 + 1 * 32]      ; lo: 74*s1 + 55*s3     | hi: 74*s1 - 84*s3
+    paddd       m1, m3                     ; lo = pass-1 output 0  | hi = pass-1 output 1 (4 dwords, one per column c)
+    paddd       m1, m5
+    psrad       m1, 7
+    pmaddwd     m3, m2, [r3 + 2 * 32]      ; lo: 74*s0 - 74*s2     | hi: 84*s0 + 55*s2
+    pmaddwd     m0, [r3 + 3 * 32]          ; lo:  0*s1 + 74*s3     | hi: -74*s1 - 29*s3
+    paddd       m3, m0                     ; lo = pass-1 output 2  | hi = pass-1 output 3
+    paddd       m3, m5
+    psrad       m3, 7
+
+    packssdw    m0, m1, m3                 ; saturate to words; lo lane = out0 | out2, hi lane = out1 | out3
+    pshufb      m0, [pb_idst4_shuf]        ; per lane: interleave the two 4-word halves -> (out0[c], out2[c]) / (out1[c], out3[c])
+    vpermq      m1, m0, 11101110b          ; m1 = high lane of m0 copied to both lanes
+
+    punpcklwd   m2, m0, m1                 ; low lane: out0..out3 for c = 0, then for c = 1 (high lanes are discarded below)
+    punpckhwd   m0, m1                     ; low lane: out0..out3 for c = 2, then for c = 3
+    punpcklwd   m1, m2, m0                 ; low lane: (out_i[0], out_i[2]) pairs, i = 0..3
+    punpckhwd   m2, m0                     ; low lane: (out_i[1], out_i[3]) pairs, i = 0..3
+
+    vpermq      m1, m1, 01000100b          ; replicate low lane into high lane, as before pass 1
+    vpermq      m2, m2, 01000100b
+
+    pmaddwd     m0, m1, [r3 + 0 * 32]      ; pass 2: same weights as pass 1, applied along the other dimension
+    pmaddwd     m3, m2, [r3 + 1 * 32]
+    paddd       m0, m3                     ; lo = result element 0 | hi = result element 1 (one dword per i)
+    paddd       m0, m4
+    psrad       m0, IDCT4_SHIFT
+    pmaddwd     m3, m1, [r3 + 2 * 32]
+    pmaddwd     m2, m2, [r3 + 3 * 32]
+    paddd       m3, m2                     ; lo = result element 2 | hi = result element 3
+    paddd       m3, m4
+    psrad       m3, IDCT4_SHIFT
+
+    packssdw    m0, m3                     ; saturate to words; lo lane = elements 0 | 2, hi lane = elements 1 | 3
+    pshufb      m1, m0, [pb_idst4_shuf]    ; per lane: interleave -> (elem0, elem2) / (elem1, elem3) per i
+    vpermq      m0, m1, 11101110b          ; m0 = high lane of m1 copied to both lanes
+
+    punpcklwd   m2, m1, m0                 ; xm2 = output rows 0 and 1, four words each in order 0,1,2,3
+    movq        [r1 + 0 * r2], xm2         ; dst row 0 (8 bytes)
+    movhps      [r1 + 1 * r2], xm2         ; dst row 1
+
+    punpckhwd   m1, m0                     ; xm1 = output rows 2 and 3
+    movq        [r1 + 2 * r2], xm1         ; dst row 2
+    lea         r1, [r1 + 2 * r2]          ; advance by two rows; r1 + r2 below then addresses row 3
+    movhps      [r1 + r2], xm1             ; dst row 3
+    RET
+
 ;-------------------------------------------------------
 ; void dct8(const int16_t* src, int16_t* dst, intptr_t srcStride)
 ;-------------------------------------------------------
diff -r ffa14b40f0ff -r cfc64c1861c2 source/common/x86/dct8.h
--- a/source/common/x86/dct8.h	Tue Mar 31 17:27:23 2015 +0530
+++ b/source/common/x86/dct8.h	Tue Mar 31 17:40:39 2015 +0530
@@ -34,6 +34,7 @@
 void x265_dct32_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
 
 void x265_idst4_sse2(const int16_t* src, int16_t* dst, intptr_t dstStride);
+void x265_idst4_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
 void x265_idct4_sse2(const int16_t* src, int16_t* dst, intptr_t dstStride);
 void x265_idct4_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
 void x265_idct8_sse2(const int16_t* src, int16_t* dst, intptr_t dstStride);


More information about the x265-devel mailing list