[x265] [PATCH] asm: avx2 code for intra_dc_32x32

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Fri Apr 3 12:14:47 CEST 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1428055909 -19800
#      Fri Apr 03 15:41:49 2015 +0530
# Node ID 83f44b5a99a1157683d63a18d05297a58437e7a3
# Parent  cef7834897bc0d53981e5dfe8790bc207deb7346
asm: avx2 code for intra_dc_32x32

AVX2:
intra_dc_32x32[f=0]     23.17x   435.66          10093.78

SSE4:
intra_dc_32x32[f=0]     14.36x   703.46          10100.78

diff -r cef7834897bc -r 83f44b5a99a1 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Apr 03 11:35:53 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Apr 03 15:41:49 2015 +0530
@@ -1471,6 +1471,8 @@
 #if X86_64
     if (cpuMask & X265_CPU_AVX2)
     {
+        p.cu[BLOCK_32x32].intra_pred[DC_IDX] = x265_intra_pred_dc32_avx2;
+
         p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = x265_intra_pred_planar16_avx2;
         p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = x265_intra_pred_planar32_avx2;
 
diff -r cef7834897bc -r 83f44b5a99a1 source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h	Fri Apr 03 11:35:53 2015 +0530
+++ b/source/common/x86/intrapred.h	Fri Apr 03 15:41:49 2015 +0530
@@ -34,6 +34,7 @@
 void x265_intra_pred_dc8_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
 void x265_intra_pred_dc16_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
 void x265_intra_pred_dc32_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
+void x265_intra_pred_dc32_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
 
 void x265_intra_pred_planar4_sse2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
 void x265_intra_pred_planar8_sse2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
diff -r cef7834897bc -r 83f44b5a99a1 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm	Fri Apr 03 11:35:53 2015 +0530
+++ b/source/common/x86/intrapred8.asm	Fri Apr 03 15:41:49 2015 +0530
@@ -573,6 +573,7 @@
 cextern pw_31
 cextern pw_32
 cextern pw_257
+cextern pw_512
 cextern pw_1024
 cextern pw_4096
 cextern pw_00ff
@@ -2251,6 +2252,69 @@
 
     RET
 
+;---------------------------------------------------------------------------------------------
+; void intra_pred_dc(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter)
+;---------------------------------------------------------------------------------------------
+%if ARCH_X86_64 == 1
+INIT_YMM avx2
+cglobal intra_pred_dc32, 3, 4, 3
+    lea             r3, [r1 * 3]    ; r3 = 3 * dstStride (only dst, dstStride, srcPix are loaded; dirMode/bFilter are not read)
+    pxor            m0, m0          ; m0 = 0, so psadbw against it yields plain byte sums
+    movu            m1, [r2 + 1]    ; m1 = srcPix[1 .. 32]  (presumably the above neighbours -- confirm against caller)
+    movu            m2, [r2 + 65]   ; m2 = srcPix[65 .. 96] (presumably the left neighbours -- confirm against caller)
+    psadbw          m1, m0          ; four partial sums, one per 8-byte group, in the low word of each qword
+    psadbw          m2, m0          ; same for the second 32 bytes
+    paddw           m1, m2          ; combine the partial sums of both loads
+    vextracti128    xm2, m1, 1      ; xm2 = upper 128-bit lane of m1
+    paddw           m1, m2          ; fold the upper lane into the lower lane
+    pshufd          m2, m1, 2       ; bring dword 2 (partial sum of the high qword) down to dword 0
+    paddw           m1, m2          ; word 0 of m1 = sum of all 64 loaded bytes
+
+    pmulhrsw        m1, [pw_512]    ; sum = (sum + 32) / 64, i.e. (sum * 512 + 0x4000) >> 15
+    vpbroadcastb    m1, xm1         ; m1 = byte [dc_val ...], low byte replicated to all 32 bytes
+
+    movu            [r0 + r1 * 0], m1   ; rows 0-3: one 32-byte store per row
+    movu            [r0 + r1 * 1], m1
+    movu            [r0 + r1 * 2], m1
+    movu            [r0 + r3 * 1], m1
+    lea             r0, [r0 + 4 * r1]   ; advance dst by 4 rows
+    movu            [r0 + r1 * 0], m1   ; rows 4-7
+    movu            [r0 + r1 * 1], m1
+    movu            [r0 + r1 * 2], m1
+    movu            [r0 + r3 * 1], m1
+    lea             r0, [r0 + 4 * r1]   ; advance dst by 4 rows
+    movu            [r0 + r1 * 0], m1   ; rows 8-11
+    movu            [r0 + r1 * 1], m1
+    movu            [r0 + r1 * 2], m1
+    movu            [r0 + r3 * 1], m1
+    lea             r0, [r0 + 4 * r1]   ; advance dst by 4 rows
+    movu            [r0 + r1 * 0], m1   ; rows 12-15
+    movu            [r0 + r1 * 1], m1
+    movu            [r0 + r1 * 2], m1
+    movu            [r0 + r3 * 1], m1
+    lea             r0, [r0 + 4 * r1]   ; advance dst by 4 rows
+    movu            [r0 + r1 * 0], m1   ; rows 16-19
+    movu            [r0 + r1 * 1], m1
+    movu            [r0 + r1 * 2], m1
+    movu            [r0 + r3 * 1], m1
+    lea             r0, [r0 + 4 * r1]   ; advance dst by 4 rows
+    movu            [r0 + r1 * 0], m1   ; rows 20-23
+    movu            [r0 + r1 * 1], m1
+    movu            [r0 + r1 * 2], m1
+    movu            [r0 + r3 * 1], m1
+    lea             r0, [r0 + 4 * r1]   ; advance dst by 4 rows
+    movu            [r0 + r1 * 0], m1   ; rows 24-27
+    movu            [r0 + r1 * 1], m1
+    movu            [r0 + r1 * 2], m1
+    movu            [r0 + r3 * 1], m1
+    lea             r0, [r0 + 4 * r1]   ; advance dst by 4 rows
+    movu            [r0 + r1 * 0], m1   ; rows 28-31
+    movu            [r0 + r1 * 1], m1
+    movu            [r0 + r1 * 2], m1
+    movu            [r0 + r3 * 1], m1
+    RET
+%endif ;; ARCH_X86_64 == 1
+
 ;---------------------------------------------------------------------------------------
 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
 ;---------------------------------------------------------------------------------------


More information about the x265-devel mailing list