[x265] [PATCH] asm: new algorithm for intra_ang_32 modes 3 & 33, over 50% faster than previous asm
chen
chenm003 at 163.com
Tue Jul 14 16:50:23 CEST 2015
At 2015-07-14 20:59:44, dnyaneshwar at multicorewareinc.com wrote:
># HG changeset patch
># User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
># Date 1436770017 -19800
># Mon Jul 13 12:16:57 2015 +0530
># Node ID 7a241bf67fa10f2ddd3c02cd82de5a71d84bbb84
># Parent 8023786c52475484a5dd475254cac67ce65e81df
>asm: new algorithm for intra_ang_32 modes 3 & 33, over 50% faster than previous asm
>
>diff -r 8023786c5247 -r 7a241bf67fa1 source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp Mon Jul 13 17:38:02 2015 -0700
>+++ b/source/common/x86/asm-primitives.cpp Mon Jul 13 12:16:57 2015 +0530
>@@ -2954,6 +2954,7 @@
> p.cu[BLOCK_32x32].intra_pred[22] = PFX(intra_pred_ang32_22_avx2);
> p.cu[BLOCK_32x32].intra_pred[21] = PFX(intra_pred_ang32_21_avx2);
> p.cu[BLOCK_32x32].intra_pred[18] = PFX(intra_pred_ang32_18_avx2);
>+ p.cu[BLOCK_32x32].intra_pred[3] = PFX(intra_pred_ang32_3_avx2);
>
> // all_angs primitives
> p.cu[BLOCK_4x4].intra_pred_allangs = PFX(all_angs_pred_4x4_avx2);
>diff -r 8023786c5247 -r 7a241bf67fa1 source/common/x86/intrapred8.asm
>--- a/source/common/x86/intrapred8.asm Mon Jul 13 17:38:02 2015 -0700
>+++ b/source/common/x86/intrapred8.asm Mon Jul 13 12:16:57 2015 +0530
>@@ -480,38 +480,6 @@
> db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
> db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
>
>-
>-ALIGN 32
>-c_ang32_mode_33: db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
>- db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
>- db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
>- db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
>- db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
>- db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
>- db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
>- db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
>- db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
>- db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
>- db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
>- db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
>- db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
>- db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
>- db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
>- db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
>- db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
>- db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
>- db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
>- db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
>- db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
>- db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
>- db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
>- db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
>- db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
>- db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
>- db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
>-
>-
>-
> ALIGN 32
> c_ang32_mode_25: db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
> db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
>@@ -530,8 +498,6 @@
> db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
> db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
>
>-
>-
> ALIGN 32
> c_ang32_mode_24: db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
> db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
>@@ -699,6 +665,13 @@
> %assign x x+1
> %endrep
>
>+const ang_table_avx2
>+%assign x 0
>+%rep 32
>+ times 16 db (32-x), x
>+%assign x x+1
>+%endrep
>+
> const pw_ang_table
> %assign x 0
> %rep 32
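
For reference: the new generic ang_table_avx2 presumably supersedes the per-mode tables removed above (e.g. c_ang32_mode_33). Each row x holds the interpolation weight pair (32-x, x) repeated 16 times across the ymm width, so the row generated for x = 6, for example, would expand to:

    db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6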
>@@ -11791,6 +11764,304 @@
> jnz .loop
> RET
>
>+;-----------------------------------------------------------------------------------------
>+; start of intra_pred_ang32 angular modes avx2 asm
>+;-----------------------------------------------------------------------------------------
>+
>+%if ARCH_X86_64 == 1
>+INIT_YMM avx2
>+
>+; register mapping :
>+; %1-%8 - output registers
>+; %9 - temp register
>+; %10 - for label naming
>+%macro TRANSPOSE_32x8_AVX2 10
>+ jnz .skip%10
>+
>+ ; transpose 8x32 to 32x8 and then store
>+ punpcklbw m%9, m%1, m%2
>+ punpckhbw m%1, m%2
>+ punpcklbw m%2, m%3, m%4
>+ punpckhbw m%3, m%4
>+ punpcklbw m%4, m%5, m%6
>+ punpckhbw m%5, m%6
>+ punpcklbw m%6, m%7, m%8
>+ punpckhbw m%7, m%8
>+
>+ punpcklwd m%8, m%9, m%2
>+ punpckhwd m%9, m%2
>+ punpcklwd m%2, m%4, m%6
>+ punpckhwd m%4, m%6
>+ punpcklwd m%6, m%1, m%3
>+ punpckhwd m%1, m%3
>+ punpcklwd m%3, m%5, m%7
>+ punpckhwd m%5, m%7
>+
>+ punpckldq m%7, m%8, m%2
>+ punpckhdq m%8, m%2
>+ punpckldq m%2, m%6, m%3
>+ punpckhdq m%6, m%3
>+ punpckldq m%3, m%9, m%4
>+ punpckhdq m%9, m%4
>+ punpckldq m%4, m%1, m%5
>+ punpckhdq m%1, m%5
>+
>+ movq [r0 + r1 * 0], xm%7
>+ movhps [r0 + r1 * 1], xm%7
>+ movq [r0 + r1 * 2], xm%8
>+ movhps [r0 + r5 * 1], xm%8
>+
>+ lea r0, [r0 + r6]
>+
>+ movq [r0 + r1 * 0], xm%3
>+ movhps [r0 + r1 * 1], xm%3
>+ movq [r0 + r1 * 2], xm%9
>+ movhps [r0 + r5 * 1], xm%9
>+
>+ lea r0, [r0 + r6]
>+
>+ movq [r0 + r1 * 0], xm%2
>+ movhps [r0 + r1 * 1], xm%2
>+ movq [r0 + r1 * 2], xm%6
>+ movhps [r0 + r5 * 1], xm%6
>+
>+ lea r0, [r0 + r6]
>+
>+ movq [r0 + r1 * 0], xm%4
>+ movhps [r0 + r1 * 1], xm%4
>+ movq [r0 + r1 * 2], xm%1
>+ movhps [r0 + r5 * 1], xm%1
>+
>+ lea r0, [r0 + r6]
>+
>+ vpermq m%8, m%8, 00001110b
>+ vpermq m%7, m%7, 00001110b
>+ vpermq m%6, m%6, 00001110b
>+ vpermq m%3, m%3, 00001110b
>+ vpermq m%9, m%9, 00001110b
>+ vpermq m%2, m%2, 00001110b
>+ vpermq m%4, m%4, 00001110b
>+ vpermq m%1, m%1, 00001110b
what's the difference from VEXTRACTI128?
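Roughly, the two forms being compared (register names here are illustrative, not the macro's actual %-arguments):

    vpermq       m0, m0, 00001110b       ; in place: qwords 2,3 -> positions 0,1
    movq         [r0 + r1 * 0], xm0
    movhps       [r0 + r1 * 1], xm0

vs.

    vextracti128 xm1, m0, 1              ; upper 128 bits of m0 -> xm1
    movq         [r0 + r1 * 0], xm1
    movhps       [r0 + r1 * 1], xm1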
>+
>+ movq [r0 + r1 * 0], xm%7
>+ movhps [r0 + r1 * 1], xm%7
>+ movq [r0 + r1 * 2], xm%8
>+ movhps [r0 + r5 * 1], xm%8
>+
>+ lea r0, [r0 + r6]
>+
>+ movq [r0 + r1 * 0], xm%3
>+ movhps [r0 + r1 * 1], xm%3
>+ movq [r0 + r1 * 2], xm%9
>+ movhps [r0 + r5 * 1], xm%9
>+
>+ lea r0, [r0 + r6]
>+
>+ movq [r0 + r1 * 0], xm%2
>+ movhps [r0 + r1 * 1], xm%2
>+ movq [r0 + r1 * 2], xm%6
>+ movhps [r0 + r5 * 1], xm%6
>+
>+ lea r0, [r0 + r6]
why not ADD?
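i.e. (sketch of the suggested alternative; note ADD updates EFLAGS while LEA does not, which may or may not matter here):

    add r0, r6               ; instead of: lea r0, [r0 + r6]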
>+
>+ movq [r0 + r1 * 0], xm%4
>+ movhps [r0 + r1 * 1], xm%4
>+ movq [r0 + r1 * 2], xm%1
>+ movhps [r0 + r5 * 1], xm%1