[x265] [PATCH] asm: avx2 code for intra_dc_32x32
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Fri Apr 3 12:14:47 CEST 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1428055909 -19800
# Fri Apr 03 15:41:49 2015 +0530
# Node ID 83f44b5a99a1157683d63a18d05297a58437e7a3
# Parent cef7834897bc0d53981e5dfe8790bc207deb7346
asm: avx2 code for intra_dc_32x32
AVX2:
intra_dc_32x32[f=0] 23.17x 435.66 10093.78
SSE4:
intra_dc_32x32[f=0] 14.36x 703.46 10100.78
diff -r cef7834897bc -r 83f44b5a99a1 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Apr 03 11:35:53 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Apr 03 15:41:49 2015 +0530
@@ -1471,6 +1471,8 @@
#if X86_64
if (cpuMask & X265_CPU_AVX2)
{
+ p.cu[BLOCK_32x32].intra_pred[DC_IDX] = x265_intra_pred_dc32_avx2;
+
p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = x265_intra_pred_planar16_avx2;
p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = x265_intra_pred_planar32_avx2;
diff -r cef7834897bc -r 83f44b5a99a1 source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h Fri Apr 03 11:35:53 2015 +0530
+++ b/source/common/x86/intrapred.h Fri Apr 03 15:41:49 2015 +0530
@@ -34,6 +34,7 @@
void x265_intra_pred_dc8_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
void x265_intra_pred_dc16_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
void x265_intra_pred_dc32_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
+void x265_intra_pred_dc32_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
void x265_intra_pred_planar4_sse2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
void x265_intra_pred_planar8_sse2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
diff -r cef7834897bc -r 83f44b5a99a1 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm Fri Apr 03 11:35:53 2015 +0530
+++ b/source/common/x86/intrapred8.asm Fri Apr 03 15:41:49 2015 +0530
@@ -573,6 +573,7 @@
cextern pw_31
cextern pw_32
cextern pw_257
+cextern pw_512
cextern pw_1024
cextern pw_4096
cextern pw_00ff
@@ -2251,6 +2252,69 @@
RET
+;---------------------------------------------------------------------------------------------
+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
+;
+; 32x32 DC intra prediction, AVX2 version: sums 64 neighbour bytes read from srcPix, takes the
+; rounded average, and fills a 32-row x 32-byte block at dst with that single byte value.
+;   r0 = dst (advanced by 4 rows at a time), r1 = dstStride, r2 = srcPix,
+;   r3 = 3 * dstStride (scratch), m0 = zero, m1/m2 = partial sums, then m1 = broadcast DC value.
+;   dirMode and bFilter are never read by this routine.
+;---------------------------------------------------------------------------------------------
+%if ARCH_X86_64 == 1
+INIT_YMM avx2
+cglobal intra_pred_dc32, 3, 4, 3 ; 3 args, 4 GPRs (r0-r3), 3 vector registers (m0-m2)
+ lea r3, [r1 * 3] ; r3 = 3 * dstStride, used to address the 4th row of each group
+ pxor m0, m0 ; m0 = 0, the reference operand for psadbw
+ movu m1, [r2 + 1] ; 32 bytes at srcPix + 1 (presumably the above neighbours; confirm against caller)
+ movu m2, [r2 + 65] ; 32 bytes at srcPix + 65 (presumably the left neighbours, 2 * 32 + 1; confirm against caller)
+ psadbw m1, m0 ; SAD against zero = sum of each 8-byte group, one result per 64-bit lane
+ psadbw m2, m0
+ paddw m1, m2 ; four partial sums covering all 64 bytes
+ vextracti128 xm2, m1, 1 ; fold the upper 128-bit half onto the lower half
+ paddw m1, m2
+ pshufd m2, m1, 2 ; bring dword 2 (sum held in the upper qword) down to dword 0
+ paddw m1, m2 ; low word of m1 = sum of all 64 neighbour bytes
+
+ pmulhrsw m1, [pw_512] ; sum = (sum + 32) / 64
+ vpbroadcastb m1, xm1 ; m1 = byte [dc_val ...]
+
+ movu [r0 + r1 * 0], m1 ; rows 0-3: one 32-byte store per row
+ movu [r0 + r1 * 1], m1
+ movu [r0 + r1 * 2], m1
+ movu [r0 + r3 * 1], m1
+ lea r0, [r0 + 4 * r1] ; advance dst by 4 rows
+ movu [r0 + r1 * 0], m1 ; rows 4-7
+ movu [r0 + r1 * 1], m1
+ movu [r0 + r1 * 2], m1
+ movu [r0 + r3 * 1], m1
+ lea r0, [r0 + 4 * r1]
+ movu [r0 + r1 * 0], m1 ; rows 8-11
+ movu [r0 + r1 * 1], m1
+ movu [r0 + r1 * 2], m1
+ movu [r0 + r3 * 1], m1
+ lea r0, [r0 + 4 * r1]
+ movu [r0 + r1 * 0], m1 ; rows 12-15
+ movu [r0 + r1 * 1], m1
+ movu [r0 + r1 * 2], m1
+ movu [r0 + r3 * 1], m1
+ lea r0, [r0 + 4 * r1]
+ movu [r0 + r1 * 0], m1 ; rows 16-19
+ movu [r0 + r1 * 1], m1
+ movu [r0 + r1 * 2], m1
+ movu [r0 + r3 * 1], m1
+ lea r0, [r0 + 4 * r1]
+ movu [r0 + r1 * 0], m1 ; rows 20-23
+ movu [r0 + r1 * 1], m1
+ movu [r0 + r1 * 2], m1
+ movu [r0 + r3 * 1], m1
+ lea r0, [r0 + 4 * r1]
+ movu [r0 + r1 * 0], m1 ; rows 24-27
+ movu [r0 + r1 * 1], m1
+ movu [r0 + r1 * 2], m1
+ movu [r0 + r3 * 1], m1
+ lea r0, [r0 + 4 * r1]
+ movu [r0 + r1 * 0], m1 ; rows 28-31 (last group, so dst is not advanced again)
+ movu [r0 + r1 * 1], m1
+ movu [r0 + r1 * 2], m1
+ movu [r0 + r3 * 1], m1
+ RET
+%endif ;; ARCH_X86_64 == 1
+
;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;---------------------------------------------------------------------------------------
More information about the x265-devel mailing list