[x265] [PATCH 6 of 6] asm: 10bpp avx2 code for intra_pred_ang32x32 mode 18, improved 1331c->884c, 31%
dnyaneshwar at multicorewareinc.com
Tue Jun 16 12:34:52 CEST 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1434444006 -19800
# Tue Jun 16 14:10:06 2015 +0530
# Node ID a0579532c68b00b2e6a4de667082c4095e4696cf
# Parent 55b27d2b3c2863bab07d872679bdb427aea6a78c
asm: 10bpp avx2 code for intra_pred_ang32x32 mode 18, improved 1331c->884c, 31%
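
Mode 18 is the pure diagonal: no interpolation is needed, every output row is the same 32-sample reference window moved one sample toward the left reference. The kernel builds a contiguous, 64-byte-aligned reference on the stack (reversed left column followed by the top-left corner and the above row) and then produces each row with two overlapping 32-byte loads plus palignr. A scalar model of the same computation, assuming only the srcPix layout implied by the byte offsets in the asm (corner and above row from pixel index 0, left column from pixel index 65); this is an illustrative sketch, not the x265 C reference code:

    #include <cstdint>
    #include <cstring>

    // Illustrative scalar model of intra_pred_ang32_18 at 10bpp (pixel == uint16_t).
    static void intra_pred_ang32_18_scalar(uint16_t* dst, intptr_t dstStride,
                                           const uint16_t* srcPix)
    {
        uint16_t ref[64];
        for (int i = 0; i < 32; i++)
            ref[31 - i] = srcPix[65 + i];    // left column, reversed
        for (int i = 0; i < 32; i++)
            ref[32 + i] = srcPix[i];         // top-left corner + above row

        for (int y = 0; y < 32; y++)         // each row starts one sample lower in ref[]
            std::memcpy(dst + y * dstStride, ref + 32 - y, 32 * sizeof(uint16_t));
    }

In the AVX2 version each row then costs just two palignr shuffles and two 32-byte stores.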
diff -r 55b27d2b3c28 -r a0579532c68b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Jun 16 16:00:25 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Jun 16 14:10:06 2015 +0530
@@ -1326,6 +1326,7 @@
p.cu[BLOCK_32x32].intra_pred[15] = x265_intra_pred_ang32_15_avx2;
p.cu[BLOCK_32x32].intra_pred[16] = x265_intra_pred_ang32_16_avx2;
p.cu[BLOCK_32x32].intra_pred[17] = x265_intra_pred_ang32_17_avx2;
+ p.cu[BLOCK_32x32].intra_pred[18] = x265_intra_pred_ang32_18_avx2;
p.cu[BLOCK_32x32].intra_pred[19] = x265_intra_pred_ang32_19_avx2;
p.cu[BLOCK_32x32].intra_pred[20] = x265_intra_pred_ang32_20_avx2;
p.cu[BLOCK_32x32].intra_pred[21] = x265_intra_pred_ang32_21_avx2;
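
For context, p is the EncoderPrimitives table, so at 10bpp with AVX2 detected this entry routes every 32x32 mode-18 prediction through the new kernel. Assuming the intra_pred_t signature (dst, dstStride, srcPix, dirMode, bFilter) and illustrative buffer names, a call site looks roughly like:

    p.cu[BLOCK_32x32].intra_pred[18](dst, 32 /* dstStride in pixels */, srcPix, 18, 0);

The routine is declared with only three register arguments, so the trailing dirMode/bFilter values are accepted but unused.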
diff -r 55b27d2b3c28 -r a0579532c68b source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Tue Jun 16 16:00:25 2015 +0530
+++ b/source/common/x86/intrapred16.asm Tue Jun 16 14:10:06 2015 +0530
@@ -17214,6 +17214,215 @@
mov rsp, [rsp+5*mmsize]
RET
+
+cglobal intra_pred_ang32_18, 3,6,6
+ mov r4, rsp
+ sub rsp, 4*mmsize+gprsize
+ and rsp, ~63
+ mov [rsp+4*mmsize], r4
+
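+ ; copy the top reference (top-left corner + first 31 above samples) into the aligned stack buffer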
+ movu m0, [r2]
+ movu m1, [r2 + 32]
+ mova [rsp + 2*mmsize], m0
+ mova [rsp + 3*mmsize], m1
+
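+ ; fetch the 32 left-reference samples, reverse their word order (pshufb reverses within each lane, vpermq swaps the lanes)
+ ; and store them just below the top samples, forming one contiguous 64-sample reference on the stack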
+ movu m2, [r2 + 130]
+ movu m3, [r2 + 162]
+ pshufb m2, [pw_swap16]
+ pshufb m3, [pw_swap16]
+ vpermq m2, m2, 01001110b
+ vpermq m3, m3, 01001110b
+ mova [rsp + 1*mmsize], m2
+ mova [rsp + 0*mmsize], m3
+
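+ ; point r2 at the stacked top samples; the stride arrives in pixels, so double it for 16-bit samples and precompute 2/3/4-row offsets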
+ add r1d, r1d
+ lea r2, [rsp+2*mmsize]
+ lea r4, [r1 * 2]
+ lea r3, [r1 * 3]
+ lea r5, [r1 * 4]
+
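+ ; row 0 is the reference window itself; every later row is that window shifted one sample (2 bytes) toward the left
+ ; reference, built with palignr over the overlapping loads at [r2 - 16] and [r2 + 16]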
+ movu m0, [r2]
+ movu m1, [r2 + 32]
+ movu m2, [r2 - 16]
+ movu m3, [r2 + 16]
+
+ movu [r0], m0
+ movu [r0 + 32], m1
+
+ palignr m4, m0, m2, 14
+ palignr m5, m1, m3, 14
+ movu [r0 + r1], m4
+ movu [r0 + r1 + 32], m5
+
+ palignr m4, m0, m2, 12
+ palignr m5, m1, m3, 12
+ movu [r0 + r4], m4
+ movu [r0 + r4 + 32], m5
+
+ palignr m4, m0, m2, 10
+ palignr m5, m1, m3, 10
+ movu [r0 + r3], m4
+ movu [r0 + r3 + 32], m5
+
+ add r0, r5
+
+ palignr m4, m0, m2, 8
+ palignr m5, m1, m3, 8
+ movu [r0], m4
+ movu [r0 + 32], m5
+
+ palignr m4, m0, m2, 6
+ palignr m5, m1, m3, 6
+ movu [r0 + r1], m4
+ movu [r0 + r1 + 32], m5
+
+ palignr m4, m0, m2, 4
+ palignr m5, m1, m3, 4
+ movu [r0 + r4], m4
+ movu [r0 + r4 + 32], m5
+
+ palignr m4, m0, m2, 2
+ palignr m5, m1, m3, 2
+ movu [r0 + r3], m4
+ movu [r0 + r3 + 32], m5
+
+ add r0, r5
+
+ movu [r0], m2
+ movu [r0 + 32], m3
+
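+ ; after eight rows the window has moved a full 16 bytes, so reload the shift sources 16 bytes lower and continue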
+ movu m0, [r2 - 32]
+ movu m1, [r2]
+
+ palignr m4, m2, m0, 14
+ palignr m5, m3, m1, 14
+ movu [r0 + r1], m4
+ movu [r0 + r1 + 32], m5
+
+ palignr m4, m2, m0, 12
+ palignr m5, m3, m1, 12
+ movu [r0 + r4], m4
+ movu [r0 + r4 + 32], m5
+
+ palignr m4, m2, m0, 10
+ palignr m5, m3, m1, 10
+ movu [r0 + r3], m4
+ movu [r0 + r3 + 32], m5
+
+ add r0, r5
+
+ palignr m4, m2, m0, 8
+ palignr m5, m3, m1, 8
+ movu [r0], m4
+ movu [r0 + 32], m5
+
+ palignr m4, m2, m0, 6
+ palignr m5, m3, m1, 6
+ movu [r0 + r1], m4
+ movu [r0 + r1 + 32], m5
+
+ palignr m4, m2, m0, 4
+ palignr m5, m3, m1, 4
+ movu [r0 + r4], m4
+ movu [r0 + r4 + 32], m5
+
+ palignr m4, m2, m0, 2
+ palignr m5, m3, m1, 2
+ movu [r0 + r3], m4
+ movu [r0 + r3 + 32], m5
+
+ add r0, r5
+
+ movu [r0], m0
+ movu [r0 + 32], m1
+
+ movu m2, [r2 - 48]
+ movu m3, [r2 - 16]
+
+ palignr m4, m0, m2, 14
+ palignr m5, m1, m3, 14
+ movu [r0 + r1], m4
+ movu [r0 + r1 + 32], m5
+
+ palignr m4, m0, m2, 12
+ palignr m5, m1, m3, 12
+ movu [r0 + r4], m4
+ movu [r0 + r4 + 32], m5
+
+ palignr m4, m0, m2, 10
+ palignr m5, m1, m3, 10
+ movu [r0 + r3], m4
+ movu [r0 + r3 + 32], m5
+
+ add r0, r5
+
+ palignr m4, m0, m2, 8
+ palignr m5, m1, m3, 8
+ movu [r0], m4
+ movu [r0 + 32], m5
+
+ palignr m4, m0, m2, 6
+ palignr m5, m1, m3, 6
+ movu [r0 + r1], m4
+ movu [r0 + r1 + 32], m5
+
+ palignr m4, m0, m2, 4
+ palignr m5, m1, m3, 4
+ movu [r0 + r4], m4
+ movu [r0 + r4 + 32], m5
+
+ palignr m4, m0, m2, 2
+ palignr m5, m1, m3, 2
+ movu [r0 + r3], m4
+ movu [r0 + r3 + 32], m5
+
+ add r0, r5
+
+ movu [r0], m2
+ movu [r0 + 32], m3
+
+ movu m0, [r2 - 64]
+ movu m1, [r2 - 32]
+
+ palignr m4, m2, m0, 14
+ palignr m5, m3, m1, 14
+ movu [r0 + r1], m4
+ movu [r0 + r1 + 32], m5
+
+ palignr m4, m2, m0, 12
+ palignr m5, m3, m1, 12
+ movu [r0 + r4], m4
+ movu [r0 + r4 + 32], m5
+
+ palignr m4, m2, m0, 10
+ palignr m5, m3, m1, 10
+ movu [r0 + r3], m4
+ movu [r0 + r3 + 32], m5
+
+ add r0, r5
+
+ palignr m4, m2, m0, 8
+ palignr m5, m3, m1, 8
+ movu [r0], m4
+ movu [r0 + 32], m5
+
+ palignr m4, m2, m0, 6
+ palignr m5, m3, m1, 6
+ movu [r0 + r1], m4
+ movu [r0 + r1 + 32], m5
+
+ palignr m4, m2, m0, 4
+ palignr m5, m3, m1, 4
+ movu [r0 + r4], m4
+ movu [r0 + r4 + 32], m5
+
+ palignr m4, m2, m0, 2
+ palignr m5, m3, m1, 2
+ movu [r0 + r3], m4
+ movu [r0 + r3 + 32], m5
+
+ mov rsp, [rsp+4*mmsize]
+ RET
;-------------------------------------------------------------------------------------------------------
; end of avx2 code for intra_pred_ang32 mode 2 to 34
;-------------------------------------------------------------------------------------------------------
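
The only non-trivial data movement in the new kernel is the left-reference reversal (pshufb with pw_swap16 followed by vpermq 01001110b). In intrinsics form the equivalent full 16-word reversal of a ymm register looks roughly like the sketch below; the shuffle constant is spelled out here because pw_swap16 is defined in the shared asm constants, so treat the exact mask bytes as an assumption:

    #include <immintrin.h>

    // Reverse the 16 uint16_t lanes of a 256-bit register: first reverse the
    // eight words inside each 128-bit lane, then swap the two lanes.
    static inline __m256i reverse_words16(__m256i v)
    {
        const __m256i swap16 = _mm256_set_epi8(
            1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,   // high lane
            1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);  // low lane
        v = _mm256_shuffle_epi8(v, swap16);        // per-lane word reversal (pshufb)
        return _mm256_permute4x64_epi64(v, 0x4e);  // swap 128-bit halves (vpermq 01001110b)
    }

pshufb can only move bytes within each 128-bit lane, which is why the lane-swapping vpermq is needed to finish the reversal.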