[x265] [PATCH] asm: intra_pred_ang32_18 improved by ~44% over SSE4
chen
chenm003 at 163.com
Mon Apr 13 15:39:06 CEST 2015
At 2015-04-13 21:36:19,praveen at multicorewareinc.com wrote:
># HG changeset patch
># User Praveen Tiwari <praveen at multicorewareinc.com>
># Date 1428917176 -19800
># Mon Apr 13 14:56:16 2015 +0530
># Node ID f4310212b0745d51d0cc5ed8b2a3098e1bcea016
># Parent 4cccf22b00ee188a72c8dc3896d7dc1613d855ad
>asm: intra_pred_ang32_18 improved by ~44% over SSE4
>
>AVX2:
>intra_ang_32x32[18] 31.25x 363.88 11371.31
>
>SSE4:
>intra_ang_32x32[18] 18.11x 648.61 11743.52
>
>diff -r 4cccf22b00ee -r f4310212b074 source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp Fri Apr 10 18:15:38 2015 -0500
>+++ b/source/common/x86/asm-primitives.cpp Mon Apr 13 14:56:16 2015 +0530
>@@ -1819,6 +1819,7 @@
> p.cu[BLOCK_32x32].intra_pred[23] = x265_intra_pred_ang32_23_avx2;
> p.cu[BLOCK_32x32].intra_pred[22] = x265_intra_pred_ang32_22_avx2;
> p.cu[BLOCK_32x32].intra_pred[21] = x265_intra_pred_ang32_21_avx2;
>+ p.cu[BLOCK_32x32].intra_pred[18] = x265_intra_pred_ang32_18_avx2;
>
> // copy_sp primitives
> p.cu[BLOCK_16x16].copy_sp = x265_blockcopy_sp_16x16_avx2;
>diff -r 4cccf22b00ee -r f4310212b074 source/common/x86/intrapred.h
>--- a/source/common/x86/intrapred.h Fri Apr 10 18:15:38 2015 -0500
>+++ b/source/common/x86/intrapred.h Mon Apr 13 14:56:16 2015 +0530
>@@ -277,6 +277,7 @@
> void x265_intra_pred_ang32_23_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
> void x265_intra_pred_ang32_22_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
> void x265_intra_pred_ang32_21_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
>+void x265_intra_pred_ang32_18_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
> void x265_all_angs_pred_4x4_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
> void x265_all_angs_pred_8x8_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
> void x265_all_angs_pred_16x16_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
>diff -r 4cccf22b00ee -r f4310212b074 source/common/x86/intrapred8.asm
>--- a/source/common/x86/intrapred8.asm Fri Apr 10 18:15:38 2015 -0500
>+++ b/source/common/x86/intrapred8.asm Mon Apr 13 14:56:16 2015 +0530
>@@ -28,6 +28,7 @@
> SECTION_RODATA 32
>
> intra_pred_shuff_0_8: times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
>+intra_pred_shuff_15_0: times 2 db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
>
> pb_0_8 times 8 db 0, 8
> pb_unpackbw1 times 2 db 1, 8, 2, 8, 3, 8, 4, 8
>@@ -10366,6 +10367,101 @@
>
> RET
>
>+INIT_YMM avx2
>+cglobal intra_pred_ang32_18, 4, 6, 3
>+ movu m0, [r2]
>+ movu xm1, [r2 + 1 + 64]
>+ pshufb xm1, [intra_pred_shuff_15_0]
>+ movu xm2, xm0
Use mova here: this is a register-to-register copy, so movu is not needed.
>+ vinserti128 m1, m1, xm2, 1
>+
>+ lea r4, [r1 * 2]
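Is a separate register for r1 * 2 needed? The scaled form [r0 + r1 * 2] can be used directly in the store address.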
>+ lea r3, [r1 * 3]
>+ lea r5, [r1 * 4]
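Is a separate register for r1 * 4 needed? The scaled form [r0 + r1 * 4] can be used directly when advancing r0.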
>+
>+ movu [r0], m0
>+ palignr m2, m0, m1, 15
>+ movu [r0 + r1], m2
>+ palignr m2, m0, m1, 14
>+ movu [r0 + r4], m2
>+ palignr m2, m0, m1, 13
>+ movu [r0 + r3], m2
>+
>+ lea r0, [r0 + r5]
>+ palignr m2, m0, m1, 12
>+ movu [r0], m2
>+ palignr m2, m0, m1, 11
>+ movu [r0 + r1], m2
>+ palignr m2, m0, m1, 10
>+ movu [r0 + r4], m2
>+ palignr m2, m0, m1, 9
>+ movu [r0 + r3], m2
>+
>+ lea r0, [r0 + r5]
Use add here instead of lea, or use lea r0, [r0 + r1 * 4] so that r5 is not needed.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150413/87f9e79c/attachment-0001.html>
More information about the x265-devel mailing list