[x265] [PATCH] asm: avx2 code for dst4x4

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Wed Apr 1 08:26:51 CEST 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1427803043 -19800
#      Tue Mar 31 17:27:23 2015 +0530
# Node ID ffa14b40f0fff3f6f22fe273458f2a4c83c50acf
# Parent  ac85c775620f1dcb0df056874633cbf916098bd2
asm: avx2 code for dst4x4

AVX2:
dst4x4                4.53x    277.57          1256.64

SSE2:
dst4x4                2.91x    431.31          1255.54

diff -r ac85c775620f -r ffa14b40f0ff source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Mar 31 20:04:28 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp	Tue Mar 31 17:27:23 2015 +0530
@@ -1447,6 +1447,7 @@
 #if X86_64
     if (cpuMask & X265_CPU_AVX2)
     {
+        p.dst4x4 = x265_dst4_avx2;
         p.scale2D_64to32 = x265_scale2D_64to32_avx2;
 
         p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_avx2;
diff -r ac85c775620f -r ffa14b40f0ff source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Tue Mar 31 20:04:28 2015 -0500
+++ b/source/common/x86/const-a.asm	Tue Mar 31 17:27:23 2015 +0530
@@ -105,6 +105,9 @@
 const multiH2,     dw 17, 18, 19, 20, 21, 22, 23, 24
 const multiH3,     dw 25, 26, 27, 28, 29, 30, 31, 32
 
+ALIGN 32
+const trans8_shuf, dd 0, 4, 1, 5, 2, 6, 3, 7    ; vpermd index vector: interleaves dwords of the low 128-bit lane (0-3) with those of the high lane (4-7); 32-byte aligned because dst4 (avx2) loads it with mova
+
 const popcnt_table
 %assign x 0
 %rep 256
diff -r ac85c775620f -r ffa14b40f0ff source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm	Tue Mar 31 20:04:28 2015 -0500
+++ b/source/common/x86/dct8.asm	Tue Mar 31 17:27:23 2015 +0530
@@ -261,6 +261,11 @@
                 times 2 dw 84, -29, -74, 55
                 times 2 dw 55, -84, 74, -29
 
+pw_dst4_tab:    times 4 dw 29,  55,  74,  84    ; coefficient row 0; each row is repeated 4x = 16 words = 32 bytes, so one ymm pmaddwd applies it to all four 4-sample input rows
+                times 4 dw 74,  74,   0, -74    ; coefficient row 1, read as [pw_dst4_tab + 1 * 32]
+                times 4 dw 84, -29, -74,  55    ; coefficient row 2, read as [pw_dst4_tab + 2 * 32]
+                times 4 dw 55, -84,  74, -29    ; coefficient row 3, read as [pw_dst4_tab + 3 * 32]
+
 tab_idst4:      times 4 dw 29, +84
                 times 4 dw +74, +55
                 times 4 dw 55, -29
@@ -316,7 +321,7 @@
 cextern pd_1024
 cextern pd_2048
 cextern pw_ppppmmmm
-
+cextern trans8_shuf
 ;------------------------------------------------------
 ;void dct4(const int16_t* src, int16_t* dst, intptr_t srcStride)
 ;------------------------------------------------------
@@ -656,6 +661,59 @@
 
     RET
 
+;------------------------------------------------------------------
+;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride)
+;------------------------------------------------------------------
+INIT_YMM avx2
+cglobal dst4, 3, 4, 6                   ; r0 = src, r1 = dst, r2 = srcStride (in int16_t units), r3 = scratch; uses m0-m5
+%if BIT_DEPTH == 8
+  %define       DST_SHIFT 1
+  vpbroadcastd  m5, [pd_1]              ; m5 = first-pass rounding term (pd_1 is defined outside this view; presumably dwords of 1 = 1 << (DST_SHIFT - 1))
+%elif BIT_DEPTH == 10
+  %define       DST_SHIFT 3
+  vpbroadcastd  m5, [pd_4]              ; presumably dwords of 4 = 1 << (DST_SHIFT - 1). NOTE(review): no other BIT_DEPTH defines DST_SHIFT or m5
+%endif
+    mova        m4, [trans8_shuf]       ; m4 = dword permutation 0,4,1,5,2,6,3,7 used to transpose after each pass
+    add         r2d, r2d                ; stride in elements -> bytes; NOTE(review): 32-bit op, assumes stride fits in 32 bits and is non-negative
+    lea         r3, [pw_dst4_tab]       ; r3 = coefficient table, one 32-byte row per output coefficient
+
+    movq        xm0, [r0 + 0 * r2]      ; xm0 low  64 bits = row 0 (4 x int16)
+    movhps      xm0, [r0 + 1 * r2]      ; xm0 high 64 bits = row 1
+    lea         r0, [r0 + 2 * r2]       ; advance src by two rows
+    movq        xm1, [r0]               ; xm1 low  64 bits = row 2
+    movhps      xm1, [r0 + r2]          ; xm1 high 64 bits = row 3
+
+    vinserti128 m0, m0, xm1, 1          ; m0 = src[0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15] (rows 0,1 in low lane, rows 2,3 in high lane)
+
+    pmaddwd     m2, m0, [r3 + 0 * 32]   ; pass 1: per row, two partial sums (pairs of products) for coefficient 0
+    pmaddwd     m1, m0, [r3 + 1 * 32]   ; same for coefficient 1
+    phaddd      m2, m1                  ; finish the 4-tap sums; per lane: [coef0 of 2 rows, coef1 of 2 rows]
+    paddd       m2, m5                  ; add rounding term
+    psrad       m2, DST_SHIFT           ; scale down after first pass
+    pmaddwd     m3, m0, [r3 + 2 * 32]   ; partial sums for coefficient 2
+    pmaddwd     m1, m0, [r3 + 3 * 32]   ; partial sums for coefficient 3 (m1 reused as scratch)
+    phaddd      m3, m1                  ; per lane: [coef2 of 2 rows, coef3 of 2 rows]
+    paddd       m3, m5
+    psrad       m3, DST_SHIFT
+    packssdw    m2, m3                  ; saturate to int16; per lane: coef0..coef3, each for that lane's 2 rows
+    vpermd      m2, m4, m2              ; merge lanes: each group of 4 words is now one coefficient across rows 0-3 (transposed)
+
+    vpbroadcastd m5, [pd_128]           ; second-pass rounding term (pd_128 is defined outside this view; presumably 1 << 7 for the shift by 8 below)
+    pmaddwd     m0, m2, [r3 + 0 * 32]   ; pass 2: same sequence applied to the transposed intermediate
+    pmaddwd     m1, m2, [r3 + 1 * 32]
+    phaddd      m0, m1
+    paddd       m0, m5
+    psrad       m0, 8                   ; fixed second-pass shift, independent of BIT_DEPTH
+    pmaddwd     m3, m2, [r3 + 2 * 32]
+    pmaddwd     m2, m2, [r3 + 3 * 32]   ; last use of the intermediate, so m2 is overwritten in place
+    phaddd      m3, m2
+    paddd       m3, m5
+    psrad       m3, 8
+    packssdw    m0, m3
+    vpermd      m0, m4, m0              ; second transpose puts the result back in the input's row/column orientation
+    movu        [r1], m0                ; store all 16 int16 outputs contiguously (32 bytes, unaligned store)
+    RET
+
 ;-------------------------------------------------------
 ;void idst4(const int16_t* src, int16_t* dst, intptr_t dstStride)
 ;-------------------------------------------------------
diff -r ac85c775620f -r ffa14b40f0ff source/common/x86/dct8.h
--- a/source/common/x86/dct8.h	Tue Mar 31 20:04:28 2015 -0500
+++ b/source/common/x86/dct8.h	Tue Mar 31 17:27:23 2015 +0530
@@ -26,6 +26,7 @@
 void x265_dct4_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride);
 void x265_dct8_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride);
 void x265_dst4_ssse3(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void x265_dst4_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
 void x265_dct8_sse4(const int16_t* src, int16_t* dst, intptr_t srcStride);
 void x265_dct4_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
 void x265_dct8_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
diff -r ac85c775620f -r ffa14b40f0ff source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm	Tue Mar 31 20:04:28 2015 -0500
+++ b/source/common/x86/intrapred8.asm	Tue Mar 31 17:27:23 2015 +0530
@@ -58,7 +58,6 @@
 c_mode16_18:    db 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
 
 ALIGN 32
-trans8_shuf:          dd 0, 4, 1, 5, 2, 6, 3, 7
 c_ang8_src1_9_2_10:   db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
 c_ang8_26_20:         db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
 c_ang8_src3_11_4_12:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
@@ -553,6 +552,7 @@
 cextern multiH2
 cextern multiH3
 cextern multi_2Row
+cextern trans8_shuf
 
 ;---------------------------------------------------------------------------------------------
 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)


More information about the x265-devel mailing list