[x265] [PATCH] asm: intra_pred_dc32 high_bit_depth code
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Mon May 4 15:17:18 CEST 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1430744823 -19800
# Mon May 04 18:37:03 2015 +0530
# Node ID 9ca2b6bcb92969ddefe75d04f5a6c2caf4fd994b
# Parent 4cf55e54fe3ec33f540b7678b02de34074c0527b
asm: intra_pred_dc32 high_bit_depth code
AVX2:
intra_dc_32x32 18.93x 801.28 15169.64
SSE:
intra_dc_32x32 10.41x 1457.15 15167.84
diff -r 4cf55e54fe3e -r 9ca2b6bcb929 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Sat May 02 10:58:05 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Mon May 04 18:37:03 2015 +0530
@@ -1181,6 +1181,8 @@
}
if (cpuMask & X265_CPU_AVX2)
{
+ p.cu[BLOCK_32x32].intra_pred[DC_IDX] = x265_intra_pred_dc32_avx2;
+
p.cu[BLOCK_32x32].ssd_s = x265_pixel_ssd_s_32_avx2;
p.cu[BLOCK_16x16].sse_ss = x265_pixel_ssd_ss_16x16_avx2;
diff -r 4cf55e54fe3e -r 9ca2b6bcb929 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Sat May 02 10:58:05 2015 -0500
+++ b/source/common/x86/intrapred16.asm Mon May 04 18:37:03 2015 +0530
@@ -448,6 +448,63 @@
%endrep
RET
+;---------------------------------------------------------------------------------------------
+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
+;
+; 32x32 DC prediction for 16-bit pixels: every output pixel = (sum of 64 neighbours + 32) >> 6.
+; In:    r0 = dst, r1 = dstStride (doubled on entry to form a byte stride), r2 = srcPix
+;        dirMode and bFilter are never loaded.
+; Clobb: r0, r1, r2, m0, m1
+;
+; Neighbours are two runs of 32 pixels at byte offsets 2 and 130 of srcPix. Word lanes are
+; only allowed to accumulate 8 pixels before pmaddwd (a signed multiply-add) widens them to
+; dwords, so the sum stays exact for pixel values of up to 12 bits.
+; NOTE(review): relies on pw_1 holding at least 8 words of 1 -- confirm against its definition.
+;---------------------------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal intra_pred_dc32, 3, 3, 2
+    add             r1d, r1d                ; stride * 2 bytes per pixel
+
+    ; per-lane word sums over the four 16-pixel neighbour vectors
+    movu            m0, [r2 + 130]          ; 130 = 32*sizeof(pixel)*2 + 1*sizeof(pixel)
+    movu            m1, [r2 + 162]          ; 162 = 130 + 32 bytes (next 16 pixels)
+    paddw           m0, m1
+    movu            m1, [r2 + 2]            ; 2 = 1*sizeof(pixel), i.e. srcPix[0] is skipped
+    paddw           m0, m1
+    movu            m1, [r2 + 34]           ; 34 = 2 + 32 bytes (next 16 pixels)
+    paddw           m0, m1                  ; 16 words, 4 pixels each
+
+    ; horizontal reduction; switch to dwords before a word lane could reach bit 15
+    vextracti128    xm1, m0, 1
+    paddw           xm0, xm1                ; 8 words, 8 pixels each
+    pmaddwd         xm0, [pw_1]             ; 4 dwords, 16 pixels each
+    pshufd          xm1, xm0, 0x0E          ; dwords 2,3 -> positions 0,1
+    paddd           xm0, xm1                ; dwords 0,1: 32 pixels each
+    pshufd          xm1, xm0, 0x01          ; dword 1 -> position 0
+    paddd           xm0, xm1                ; dword 0: sum of all 64 pixels
+    paddd           xm0, [pd_32]            ; sum = sum + 32 (round to nearest)
+    psrld           xm0, 6                  ; sum = sum / 64
+    vpbroadcastw    m0, xm0                 ; DC value in all 16 word lanes
+
+    ; store DC 32x32: 64 bytes per row, 4 rows per group, 8 groups
+    lea             r2, [r1 * 3]            ; byte offset of the 4th row in a group
+%assign rowgrp 0
+%rep 8
+%if rowgrp > 0
+    lea             r0, [r0 + r1 * 4]       ; advance dst to the next group of 4 rows
+%endif
+    movu            [r0 + r1 * 0 +  0], m0
+    movu            [r0 + r1 * 0 + 32], m0
+    movu            [r0 + r1 * 1 +  0], m0
+    movu            [r0 + r1 * 1 + 32], m0
+    movu            [r0 + r1 * 2 +  0], m0
+    movu            [r0 + r1 * 2 + 32], m0
+    movu            [r0 + r2 * 1 +  0], m0
+    movu            [r0 + r2 * 1 + 32], m0
+%assign rowgrp rowgrp+1
+%endrep
+    RET
+
;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;---------------------------------------------------------------------------------------
More information about the x265-devel
mailing list