[x265] [PATCH] asm: new algorithm for intra_ang_32 modes 3 & 33, over 50% faster than the previous asm

chen chenm003 at 163.com
Tue Jul 14 16:50:23 CEST 2015




At 2015-07-14 20:59:44, dnyaneshwar at multicorewareinc.com wrote:
># HG changeset patch
># User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
># Date 1436770017 -19800
>#      Mon Jul 13 12:16:57 2015 +0530
># Node ID 7a241bf67fa10f2ddd3c02cd82de5a71d84bbb84
># Parent  8023786c52475484a5dd475254cac67ce65e81df
>asm: new algorithm for intra_ang_32 modes 3 & 33, over 50% faster than the previous asm
>
>diff -r 8023786c5247 -r 7a241bf67fa1 source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp	Mon Jul 13 17:38:02 2015 -0700
>+++ b/source/common/x86/asm-primitives.cpp	Mon Jul 13 12:16:57 2015 +0530
>@@ -2954,6 +2954,7 @@
>         p.cu[BLOCK_32x32].intra_pred[22] = PFX(intra_pred_ang32_22_avx2);
>         p.cu[BLOCK_32x32].intra_pred[21] = PFX(intra_pred_ang32_21_avx2);
>         p.cu[BLOCK_32x32].intra_pred[18] = PFX(intra_pred_ang32_18_avx2);
>+        p.cu[BLOCK_32x32].intra_pred[3]  = PFX(intra_pred_ang32_3_avx2);
> 
>         // all_angs primitives
>         p.cu[BLOCK_4x4].intra_pred_allangs = PFX(all_angs_pred_4x4_avx2);
>diff -r 8023786c5247 -r 7a241bf67fa1 source/common/x86/intrapred8.asm
>--- a/source/common/x86/intrapred8.asm	Mon Jul 13 17:38:02 2015 -0700
>+++ b/source/common/x86/intrapred8.asm	Mon Jul 13 12:16:57 2015 +0530
>@@ -480,38 +480,6 @@
>                    db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
>                    db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
> 
>-
>-ALIGN 32
>-c_ang32_mode_33:   db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
>-                   db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
>-                   db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
>-                   db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
>-                   db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
>-                   db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
>-                   db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
>-                   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
>-                   db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
>-                   db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
>-                   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
>-                   db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
>-                   db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
>-                   db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
>-                   db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
>-                   db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
>-                   db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
>-                   db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
>-                   db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
>-                   db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
>-                   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
>-                   db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
>-                   db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
>-                   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
>-                   db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
>-                   db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
>-                   db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
>-
>-
>-
> ALIGN 32
> c_ang32_mode_25:   db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
>                    db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
>@@ -530,8 +498,6 @@
>                    db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
>                    db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
> 
>-
>-
> ALIGN 32
> c_ang32_mode_24:   db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
>                    db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
>@@ -699,6 +665,13 @@
> %assign x x+1
> %endrep
> 
>+const ang_table_avx2
>+%assign x 0
>+%rep 32
>+    times 16 db (32-x), x
>+%assign x x+1
>+%endrep
>+
> const pw_ang_table
> %assign x 0
> %rep 32
>@@ -11791,6 +11764,304 @@
>     jnz        .loop
>     RET
> 
>+;-----------------------------------------------------------------------------------------
>+; start of intra_pred_ang32 angular modes avx2 asm
>+;-----------------------------------------------------------------------------------------
>+
>+%if ARCH_X86_64 == 1
>+INIT_YMM avx2
>+
>+; register mapping :
>+; %1-%8 - output registers
>+; %9    - temp register
>+; %10   - for label naming
>+%macro TRANSPOSE_32x8_AVX2 10
>+    jnz         .skip%10
>+
>+    ; transpose 8x32 to 32x8 and then store
>+    punpcklbw   m%9, m%1, m%2
>+    punpckhbw   m%1, m%2
>+    punpcklbw   m%2, m%3, m%4
>+    punpckhbw   m%3, m%4
>+    punpcklbw   m%4, m%5, m%6
>+    punpckhbw   m%5, m%6
>+    punpcklbw   m%6, m%7, m%8
>+    punpckhbw   m%7, m%8
>+
>+    punpcklwd   m%8, m%9, m%2
>+    punpckhwd   m%9, m%2
>+    punpcklwd   m%2, m%4, m%6
>+    punpckhwd   m%4, m%6
>+    punpcklwd   m%6, m%1, m%3
>+    punpckhwd   m%1, m%3
>+    punpcklwd   m%3, m%5, m%7
>+    punpckhwd   m%5, m%7
>+
>+    punpckldq   m%7, m%8, m%2
>+    punpckhdq   m%8, m%2
>+    punpckldq   m%2, m%6, m%3
>+    punpckhdq   m%6, m%3
>+    punpckldq   m%3, m%9, m%4
>+    punpckhdq   m%9, m%4
>+    punpckldq   m%4, m%1, m%5
>+    punpckhdq   m%1, m%5
>+
>+    movq        [r0 + r1 * 0], xm%7
>+    movhps      [r0 + r1 * 1], xm%7
>+    movq        [r0 + r1 * 2], xm%8
>+    movhps      [r0 + r5 * 1], xm%8
>+
>+    lea         r0, [r0 + r6]
>+
>+    movq        [r0 + r1 * 0], xm%3
>+    movhps      [r0 + r1 * 1], xm%3
>+    movq        [r0 + r1 * 2], xm%9
>+    movhps      [r0 + r5 * 1], xm%9
>+
>+    lea         r0, [r0 + r6]
>+
>+    movq        [r0 + r1 * 0], xm%2
>+    movhps      [r0 + r1 * 1], xm%2
>+    movq        [r0 + r1 * 2], xm%6
>+    movhps      [r0 + r5 * 1], xm%6
>+
>+    lea         r0, [r0 + r6]
>+
>+    movq        [r0 + r1 * 0], xm%4
>+    movhps      [r0 + r1 * 1], xm%4
>+    movq        [r0 + r1 * 2], xm%1
>+    movhps      [r0 + r5 * 1], xm%1
>+
>+    lea         r0, [r0 + r6]
>+
>+    vpermq      m%8, m%8, 00001110b
>+    vpermq      m%7, m%7, 00001110b
>+    vpermq      m%6, m%6, 00001110b
>+    vpermq      m%3, m%3, 00001110b
>+    vpermq      m%9, m%9, 00001110b
>+    vpermq      m%2, m%2, 00001110b
>+    vpermq      m%4, m%4, 00001110b
>+    vpermq      m%1, m%1, 00001110b
what's the difference compared to VEXTRACTI128?
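For reference, the VEXTRACTI128 form I have in mind would be roughly the following (untested sketch, shown for m%7 only; the other seven registers would follow the same pattern, and the movq/movhps stores below would stay as they are):

    vextracti128    xm%7, m%7, 1        ; pull the high 128-bit lane down into the low half

As far as I know both VPERMQ and VEXTRACTI128 end up as a single shuffle uop on AVX2 parts, so this is mostly a readability question rather than a speed one.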

>+
>+    movq        [r0 + r1 * 0], xm%7
>+    movhps      [r0 + r1 * 1], xm%7
>+    movq        [r0 + r1 * 2], xm%8
>+    movhps      [r0 + r5 * 1], xm%8
>+
>+    lea         r0, [r0 + r6]
>+
>+    movq        [r0 + r1 * 0], xm%3
>+    movhps      [r0 + r1 * 1], xm%3
>+    movq        [r0 + r1 * 2], xm%9
>+    movhps      [r0 + r5 * 1], xm%9
>+
>+    lea         r0, [r0 + r6]
>+
>+    movq        [r0 + r1 * 0], xm%2
>+    movhps      [r0 + r1 * 1], xm%2
>+    movq        [r0 + r1 * 2], xm%6
>+    movhps      [r0 + r5 * 1], xm%6
>+
>+    lea         r0, [r0 + r6]
why not ADD?
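i.e. something along these lines (sketch):

    add         r0, r6                  ; advance the row pointer, same effect as the LEA above

ADD has a slightly shorter encoding than this add-only LEA (no scale, no displacement); the usual reason to keep LEA is that it leaves the flags untouched, which only matters if the flags tested by the jnz at the top of the macro are still needed afterwards.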

>+
>+    movq        [r0 + r1 * 0], xm%4
>+    movhps      [r0 + r1 * 1], xm%4
>+    movq        [r0 + r1 * 2], xm%1
>+    movhps      [r0 + r5 * 1], xm%1