[x265] Fwd: [PATCH] asm-avx2: intra_pred_ang8_11

chen chenm003 at 163.com
Wed Mar 11 20:10:22 CET 2015


 
At 2015-03-11 20:34:16,"Praveen Tiwari" <praveen at multicorewareinc.com> wrote:



---------- Forwarded message ----------
From: chen<chenm003 at 163.com>
Date: Wed, Mar 11, 2015 at 2:33 AM
Subject: Re: [x265] [PATCH] asm-avx2: intra_pred_ang8_11
To: Development for x265 <x265-devel at videolan.org>



>>It's right now; there is just one small point:
>>[trans8_shuf] is used only 2 times, so buffering it in a register gives the same speed but with more code size.


   Do you mean instead of,
    mova              m0, [trans8_shuf]
    vpermd            m1, m0, m1
    vpermd            m4, m0, m4


we should use this,
    vpermd            m1, [trans8_shuf], m1
    vpermd            m4, [trans8_shuf], m4


Won't the compiler internally use two 'mova' instructions rather than just one? Can we depend on the compiler here for this optimization? Also, the syntax of 'vpermd' does not allow this.
 
[MC] The Intel docs show "VPERMD ymm1, ymm2, ymm3/m256", so the memory operand is only allowed as the third operand and your instruction format is not valid. My comment was just to tell you that when you use a constant only 2 times, you need not buffer it in a register. No modification is needed on this patch.

At 2015-03-10 13:58:50,praveen at multicorewareinc.com wrote:
># HG changeset patch
># User Praveen Tiwari <praveen at multicorewareinc.com>
># Date 1425967049 -19800
># Node ID 810995b991eba3f7dcd9014db3b58a6b07723be3
># Parent  f97dfb483647d573cbcab9a4f007ac2aa89c9066
>asm-avx2: intra_pred_ang8_11
>
>diff -r f97dfb483647 -r 810995b991eb source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp	Tue Mar 10 10:49:11 2015 +0530
>+++ b/source/common/x86/asm-primitives.cpp	Tue Mar 10 11:27:29 2015 +0530
>@@ -1496,6 +1496,7 @@
>         p.cu[BLOCK_8x8].intra_pred[9] = x265_intra_pred_ang8_9_avx2;
>         p.cu[BLOCK_8x8].intra_pred[27] = x265_intra_pred_ang8_27_avx2;
>         p.cu[BLOCK_8x8].intra_pred[25] = x265_intra_pred_ang8_25_avx2;
>+        p.cu[BLOCK_8x8].intra_pred[11] = x265_intra_pred_ang8_11_avx2;
> 
>         // copy_sp primitives
>         p.cu[BLOCK_16x16].copy_sp = x265_blockcopy_sp_16x16_avx2;
>diff -r f97dfb483647 -r 810995b991eb source/common/x86/intrapred.h
>--- a/source/common/x86/intrapred.h	Tue Mar 10 10:49:11 2015 +0530
>+++ b/source/common/x86/intrapred.h	Tue Mar 10 11:27:29 2015 +0530
>@@ -179,6 +179,7 @@
> void x265_intra_pred_ang8_9_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
> void x265_intra_pred_ang8_27_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
> void x265_intra_pred_ang8_25_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
>+void x265_intra_pred_ang8_11_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
> void x265_all_angs_pred_4x4_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
> void x265_all_angs_pred_8x8_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
> void x265_all_angs_pred_16x16_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
>diff -r f97dfb483647 -r 810995b991eb source/common/x86/intrapred8.asm
>--- a/source/common/x86/intrapred8.asm	Tue Mar 10 10:49:11 2015 +0530
>+++ b/source/common/x86/intrapred8.asm	Tue Mar 10 11:27:29 2015 +0530
>@@ -10317,3 +10317,47 @@
>     movhps            [r0 + 2 * r1], xm4
>     movhps            [r0 + r3], xm2
>     RET
>+
>+INIT_YMM avx2
>+cglobal intra_pred_ang8_11, 3, 5, 5
>+    mova              m3, [pw_1024]
>+    movu              xm1, [r2 + 16]
>+    pinsrb            xm1, [r2], 0
>+    pshufb            xm1, [intra_pred_shuff_0_8]
>+    vinserti128       m0, m1, xm1, 1
>+
>+    lea               r4, [c_ang8_mode_25]
>+    pmaddubsw         m1, m0, [r4]
>+    pmulhrsw          m1, m3
>+    pmaddubsw         m2, m0, [r4 + mmsize]
>+    pmulhrsw          m2, m3
>+    pmaddubsw         m4, m0, [r4 + 2 * mmsize]
>+    pmulhrsw          m4, m3
>+    pmaddubsw         m0, [r4 + 3 * mmsize]
>+    pmulhrsw          m0, m3
>+    packuswb          m1, m2
>+    packuswb          m4, m0
>+
>+    vperm2i128        m2, m1, m4, 00100000b
>+    vperm2i128        m1, m1, m4, 00110001b
>+    punpcklbw         m4, m2, m1
>+    punpckhbw         m2, m1
>+    punpcklwd         m1, m4, m2
>+    punpckhwd         m4, m2
>+    mova              m0, [trans8_shuf]
>+    vpermd            m1, m0, m1
>+    vpermd            m4, m0, m4
>+
>+    lea               r3, [3 * r1]
>+    movq              [r0], xm1
>+    movhps            [r0 + r1], xm1
>+    vextracti128      xm2, m1, 1
>+    movq              [r0 + 2 * r1], xm2
>+    movhps            [r0 + r3], xm2
>+    lea               r0, [r0 + 4 * r1]
>+    movq              [r0], xm4
>+    movhps            [r0 + r1], xm4
>+    vextracti128      xm2, m4, 1
>+    movq              [r0 + 2 * r1], xm2
>+    movhps            [r0 + r3], xm2
>+    RET
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel

_______________________________________________
x265-devel mailing list
x265-devel at videolan.org
https://mailman.videolan.org/listinfo/x265-devel



-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150312/4bfb8213/attachment.html>


More information about the x265-devel mailing list