[x265] [PATCH] asm: intra_pred_dc32 high_bit_depth code
chen
chenm003 at 163.com
Mon May 4 19:27:54 CEST 2015
At 2015-05-04 21:17:18,dnyaneshwar at multicorewareinc.com wrote:
># HG changeset patch
># User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
># Date 1430744823 -19800
># Mon May 04 18:37:03 2015 +0530
># Node ID 9ca2b6bcb92969ddefe75d04f5a6c2caf4fd994b
># Parent 4cf55e54fe3ec33f540b7678b02de34074c0527b
>asm: intra_pred_dc32 high_bit_depth code
>
>AVX2:
>intra_dc_32x32 18.93x 801.28 15169.64
>
>SSE:
>intra_dc_32x32 10.41x 1457.15 15167.84
>
>diff -r 4cf55e54fe3e -r 9ca2b6bcb929 source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp Sat May 02 10:58:05 2015 -0500
>+++ b/source/common/x86/asm-primitives.cpp Mon May 04 18:37:03 2015 +0530
>@@ -1181,6 +1181,8 @@
> }
> if (cpuMask & X265_CPU_AVX2)
> {
>+ p.cu[BLOCK_32x32].intra_pred[DC_IDX] = x265_intra_pred_dc32_avx2;
>+
> p.cu[BLOCK_32x32].ssd_s = x265_pixel_ssd_s_32_avx2;
> p.cu[BLOCK_16x16].sse_ss = x265_pixel_ssd_ss_16x16_avx2;
>
>diff -r 4cf55e54fe3e -r 9ca2b6bcb929 source/common/x86/intrapred16.asm
>--- a/source/common/x86/intrapred16.asm Sat May 02 10:58:05 2015 -0500
>+++ b/source/common/x86/intrapred16.asm Mon May 04 18:37:03 2015 +0530
>@@ -448,6 +448,107 @@
> %endrep
> RET
>
>+;---------------------------------------------------------------------------------------------
>+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
>+;---------------------------------------------------------------------------------------------
>+INIT_YMM avx2
>+cglobal intra_pred_dc32, 3, 3, 2
>+ add r1d, r1d
>+ movu m0, [r2 + 130] ; 130 = 32*sizeof(pixel)*2 + 1*sizeof(pixel)
>+ movu m1, [r2 + 162] ; 162 = 130+32
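Pre-adjusting r2 to r2 + 128 before these loads would shrink the displacement field: 130 and 162 each need a 4-byte displacement, whereas relative to r2 + 128 all four offsets (+2, +34, -126, -94) fit in a single signed byte.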
>+ paddw m0, m1
>+ movu m1, [r2 + 2]
>+ paddw m0, m1
>+ movu m1, [r2 + 34]
>+ paddw m0, m1
>+ vextracti128 xm1, m0, 1
>+ paddw xm0, xm1
>+ movhlps xm1, xm0
>+ paddw m0, m1
>+ pshuflw xm1, xm0, 0x6E
>+ paddw m0, m1
Using phaddw here would save one instruction (at the cost of more uops).
>+ pmaddwd m0, [pw_1]
>+ paddd m0, [pd_32] ; sum = sum + 32
>+ psrld m0, 6 ; sum = sum / 64
Note: the dynamic range here is that of 64 10-bit values, and the output is 16-bit unsigned. In 12bpp mode the values above need care, because the sum may overflow.
>+ pshuflw xm0, xm0, 0
>+ pshufd m0, m0, 0
>+ vinserti128 m0, m0, xm0, 1
A single vpbroadcastw could replace the three instructions above.
>+
>+ lea r2, [r1 * 3]
>+ ; store DC 32x32
>+ movu [r0 + r1 * 0 + 0], m0
>+ movu [r0 + r1 * 0 + 32], m0
32 = mmsize here; consider writing mmsize instead of the literal.
>+ movu [r0 + r1 * 1 + 0], m0
>+ movu [r0 + r1 * 1 + 32], m0
>+ movu [r0 + r1 * 2 + 0], m0
>+ movu [r0 + r1 * 2 + 32], m0
>+ movu [r0 + r2 * 1 + 0], m0
>+ movu [r0 + r2 * 1 + 32], m0
>+ lea r0, [r0 + r1 * 4]
>+ movu [r0 + r1 * 0 + 0], m0
>+ movu [r0 + r1 * 0 + 32], m0
>+ movu [r0 + r1 * 1 + 0], m0
>+ movu [r0 + r1 * 1 + 32], m0
>+ movu [r0 + r1 * 2 + 0], m0
>+ movu [r0 + r1 * 2 + 32], m0
>+ movu [r0 + r2 * 1 + 0], m0
>+ movu [r0 + r2 * 1 + 32], m0
>+ lea r0, [r0 + r1 * 4]
>+ movu [r0 + r1 * 0 + 0], m0
>+ movu [r0 + r1 * 0 + 32], m0
>+ movu [r0 + r1 * 1 + 0], m0
>+ movu [r0 + r1 * 1 + 32], m0
>+ movu [r0 + r1 * 2 + 0], m0
>+ movu [r0 + r1 * 2 + 32], m0
>+ movu [r0 + r2 * 1 + 0], m0
>+ movu [r0 + r2 * 1 + 32], m0
>+ lea r0, [r0 + r1 * 4]
>+ movu [r0 + r1 * 0 + 0], m0
>+ movu [r0 + r1 * 0 + 32], m0
>+ movu [r0 + r1 * 1 + 0], m0
>+ movu [r0 + r1 * 1 + 32], m0
>+ movu [r0 + r1 * 2 + 0], m0
>+ movu [r0 + r1 * 2 + 32], m0
>+ movu [r0 + r2 * 1 + 0], m0
>+ movu [r0 + r2 * 1 + 32], m0
>+ lea r0, [r0 + r1 * 4]
>+ movu [r0 + r1 * 0 + 0], m0
>+ movu [r0 + r1 * 0 + 32], m0
>+ movu [r0 + r1 * 1 + 0], m0
>+ movu [r0 + r1 * 1 + 32], m0
>+ movu [r0 + r1 * 2 + 0], m0
>+ movu [r0 + r1 * 2 + 32], m0
>+ movu [r0 + r2 * 1 + 0], m0
>+ movu [r0 + r2 * 1 + 32], m0
>+ lea r0, [r0 + r1 * 4]
>+ movu [r0 + r1 * 0 + 0], m0
>+ movu [r0 + r1 * 0 + 32], m0
>+ movu [r0 + r1 * 1 + 0], m0
>+ movu [r0 + r1 * 1 + 32], m0
>+ movu [r0 + r1 * 2 + 0], m0
>+ movu [r0 + r1 * 2 + 32], m0
>+ movu [r0 + r2 * 1 + 0], m0
>+ movu [r0 + r2 * 1 + 32], m0
>+ lea r0, [r0 + r1 * 4]
>+ movu [r0 + r1 * 0 + 0], m0
>+ movu [r0 + r1 * 0 + 32], m0
>+ movu [r0 + r1 * 1 + 0], m0
>+ movu [r0 + r1 * 1 + 32], m0
>+ movu [r0 + r1 * 2 + 0], m0
>+ movu [r0 + r1 * 2 + 32], m0
>+ movu [r0 + r2 * 1 + 0], m0
>+ movu [r0 + r2 * 1 + 32], m0
>+ lea r0, [r0 + r1 * 4]
>+ movu [r0 + r1 * 0 + 0], m0
>+ movu [r0 + r1 * 0 + 32], m0
>+ movu [r0 + r1 * 1 + 0], m0
>+ movu [r0 + r1 * 1 + 32], m0
>+ movu [r0 + r1 * 2 + 0], m0
>+ movu [r0 + r1 * 2 + 32], m0
>+ movu [r0 + r2 * 1 + 0], m0
>+ movu [r0 + r2 * 1 + 32], m0
>+ RET
>+
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150505/23ea6418/attachment-0001.html>
More information about the x265-devel
mailing list