[x265] Fwd: [PATCH] asm-avx2: intra_pred_ang8_11
chen
chenm003 at 163.com
Wed Mar 11 20:10:22 CET 2015
At 2015-03-11 20:34:16,"Praveen Tiwari" <praveen at multicorewareinc.com> wrote:
---------- Forwarded message ----------
From: chen<chenm003 at 163.com>
Date: Wed, Mar 11, 2015 at 2:33 AM
Subject: Re: [x265] [PATCH] asm-avx2: intra_pred_ang8_11
To: Development for x265 <x265-devel at videolan.org>
>>It's right now; just one small issue:
>>[trans8_shuf] is used only 2 times; buffering it in a register gives the same speed but with more code size.
Do you mean instead of,
mova m0, [trans8_shuf]
vpermd m1, m0, m1
vpermd m4, m0, m4
we should use this,
vpermd m1, [trans8_shuf], m1
vpermd m4, [trans8_shuf], m4
Won't the compiler then use two 'mova' instructions internally rather than just one? Can we depend on the compiler here for this optimization? Besides, the syntax of 'vpermd' does not even allow this.
[MC] The Intel docs show "VPERMD ymm1, ymm2, ymm3/m256", so your instruction format is not possible. My comment was just to tell you that when a constant is used only 2 times, you need not buffer it; no modification is needed for this patch.
At 2015-03-10 13:58:50,praveen at multicorewareinc.com wrote:
># HG changeset patch
># User Praveen Tiwari <praveen at multicorewareinc.com>
># Date 1425967049 -19800
># Node ID 810995b991eba3f7dcd9014db3b58a6b07723be3
># Parent f97dfb483647d573cbcab9a4f007ac2aa89c9066
>asm-avx2: intra_pred_ang8_11
>
>diff -r f97dfb483647 -r 810995b991eb source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp Tue Mar 10 10:49:11 2015 +0530
>+++ b/source/common/x86/asm-primitives.cpp Tue Mar 10 11:27:29 2015 +0530
>@@ -1496,6 +1496,7 @@
> p.cu[BLOCK_8x8].intra_pred[9] = x265_intra_pred_ang8_9_avx2;
> p.cu[BLOCK_8x8].intra_pred[27] = x265_intra_pred_ang8_27_avx2;
> p.cu[BLOCK_8x8].intra_pred[25] = x265_intra_pred_ang8_25_avx2;
>+ p.cu[BLOCK_8x8].intra_pred[11] = x265_intra_pred_ang8_11_avx2;
>
> // copy_sp primitives
> p.cu[BLOCK_16x16].copy_sp = x265_blockcopy_sp_16x16_avx2;
>diff -r f97dfb483647 -r 810995b991eb source/common/x86/intrapred.h
>--- a/source/common/x86/intrapred.h Tue Mar 10 10:49:11 2015 +0530
>+++ b/source/common/x86/intrapred.h Tue Mar 10 11:27:29 2015 +0530
>@@ -179,6 +179,7 @@
> void x265_intra_pred_ang8_9_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
> void x265_intra_pred_ang8_27_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
> void x265_intra_pred_ang8_25_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
>+void x265_intra_pred_ang8_11_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
> void x265_all_angs_pred_4x4_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
> void x265_all_angs_pred_8x8_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
> void x265_all_angs_pred_16x16_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
>diff -r f97dfb483647 -r 810995b991eb source/common/x86/intrapred8.asm
>--- a/source/common/x86/intrapred8.asm Tue Mar 10 10:49:11 2015 +0530
>+++ b/source/common/x86/intrapred8.asm Tue Mar 10 11:27:29 2015 +0530
>@@ -10317,3 +10317,47 @@
> movhps [r0 + 2 * r1], xm4
> movhps [r0 + r3], xm2
> RET
>+
>+INIT_YMM avx2
>+cglobal intra_pred_ang8_11, 3, 5, 5
>+ mova m3, [pw_1024]
>+ movu xm1, [r2 + 16]
>+ pinsrb xm1, [r2], 0
>+ pshufb xm1, [intra_pred_shuff_0_8]
>+ vinserti128 m0, m1, xm1, 1
>+
>+ lea r4, [c_ang8_mode_25]
>+ pmaddubsw m1, m0, [r4]
>+ pmulhrsw m1, m3
>+ pmaddubsw m2, m0, [r4 + mmsize]
>+ pmulhrsw m2, m3
>+ pmaddubsw m4, m0, [r4 + 2 * mmsize]
>+ pmulhrsw m4, m3
>+ pmaddubsw m0, [r4 + 3 * mmsize]
>+ pmulhrsw m0, m3
>+ packuswb m1, m2
>+ packuswb m4, m0
>+
>+ vperm2i128 m2, m1, m4, 00100000b
>+ vperm2i128 m1, m1, m4, 00110001b
>+ punpcklbw m4, m2, m1
>+ punpckhbw m2, m1
>+ punpcklwd m1, m4, m2
>+ punpckhwd m4, m2
>+ mova m0, [trans8_shuf]
>+ vpermd m1, m0, m1
>+ vpermd m4, m0, m4
>+
>+ lea r3, [3 * r1]
>+ movq [r0], xm1
>+ movhps [r0 + r1], xm1
>+ vextracti128 xm2, m1, 1
>+ movq [r0 + 2 * r1], xm2
>+ movhps [r0 + r3], xm2
>+ lea r0, [r0 + 4 * r1]
>+ movq [r0], xm4
>+ movhps [r0 + r1], xm4
>+ vextracti128 xm2, m4, 1
>+ movq [r0 + 2 * r1], xm2
>+ movhps [r0 + r3], xm2
>+ RET
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
_______________________________________________
x265-devel mailing list
x265-devel at videolan.org
https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150312/4bfb8213/attachment.html>
More information about the x265-devel
mailing list