[x265] [PATCH] asm: avx2 code for intra_pred_ang32x32 mode 2, 3, 33 & 34
dnyaneshwar at multicorewareinc.com
Tue Jun 9 10:32:13 CEST 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1433833289 -19800
# Tue Jun 09 12:31:29 2015 +0530
# Node ID 05df60c805d3f423db573885eb9f27b17dbc12a7
# Parent b252468dde7ffca57da27575388d95ce538945d2
asm: avx2 code for intra_pred_ang32x32 mode 2, 3, 33 & 34
performance improvement over SSE:
intra_ang_32x32[ 2] 1382c->669c, 51%
intra_ang_32x32[ 3] 9887c->4088c, 58%
intra_ang_32x32[33] 7021c->2773c, 60%
intra_ang_32x32[34] 1278c->670c, 47%
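
For reference, the four modes covered here are the positive-angle cases of HEVC angular intra prediction: modes 2 and 34 use intraPredAngle = 32, so every row is a shifted copy of the reference (hence the palignr-only kernel, shared between the two modes by selecting the left or above reference), while modes 3 and 33 use intraPredAngle = 26 and blend two reference samples per pixel with weights from ang_table, built below from four calls to the existing ang16_mode_3_33 kernel. A minimal scalar sketch of the computation, assuming a projected main reference array ref (the function name, signature, and indexing convention here are illustrative, not x265's API):

    #include <stdint.h>

    /* Illustrative scalar model of the positive-angle modes in this patch.
     * angle = 32 for modes 2/34 (pure copy), angle = 26 for modes 3/33 (blend).
     * 'ref' stands for the projected main reference row/column; its exact
     * indexing and this signature are assumptions, not the asm's interface. */
    static void intra_ang32_pos_angle(uint16_t *dst, intptr_t dstStride,
                                      const uint16_t *ref, int intraPredAngle)
    {
        for (int y = 0; y < 32; y++)
        {
            int pos   = (y + 1) * intraPredAngle; /* projected position, Q5 fixed point */
            int idx   = pos >> 5;                 /* integer reference offset            */
            int fract = pos & 31;                 /* 5-bit fractional weight (0..31)     */
            for (int x = 0; x < 32; x++)
            {
                if (fract == 0)   /* angle 32: each row copies the reference, shifted by one more sample */
                    dst[y * dstStride + x] = ref[x + idx];
                else              /* angle 26: two-tap weighted average with rounding */
                    dst[y * dstStride + x] = (uint16_t)(((32 - fract) * ref[x + idx]
                                                         + fract * ref[x + idx + 1] + 16) >> 5);
            }
        }
    }

With angle 26 the reference offset advances by 13 samples every 16 rows ((16 * 26) >> 5 = 13, i.e. 26 bytes of 16-bit pixels), which is why the 32x32 mode 3/33 kernels below simply re-run the 16x16 routine with the source pointer stepped forward between calls.
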
diff -r b252468dde7f -r 05df60c805d3 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Jun 09 10:16:44 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Jun 09 12:31:29 2015 +0530
@@ -1256,6 +1256,11 @@
p.cu[BLOCK_16x16].intra_pred[33] = x265_intra_pred_ang16_33_avx2;
p.cu[BLOCK_16x16].intra_pred[34] = x265_intra_pred_ang16_2_avx2;
+ p.cu[BLOCK_32x32].intra_pred[2] = x265_intra_pred_ang32_2_avx2;
+ p.cu[BLOCK_32x32].intra_pred[3] = x265_intra_pred_ang32_3_avx2;
+ p.cu[BLOCK_32x32].intra_pred[33] = x265_intra_pred_ang32_33_avx2;
+ p.cu[BLOCK_32x32].intra_pred[34] = x265_intra_pred_ang32_2_avx2;
+
p.pu[LUMA_8x4].addAvg = x265_addAvg_8x4_avx2;
p.pu[LUMA_8x8].addAvg = x265_addAvg_8x8_avx2;
p.pu[LUMA_8x16].addAvg = x265_addAvg_8x16_avx2;
diff -r b252468dde7f -r 05df60c805d3 source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h Tue Jun 09 10:16:44 2015 +0530
+++ b/source/common/x86/intrapred.h Tue Jun 09 12:31:29 2015 +0530
@@ -276,6 +276,7 @@
void x265_intra_pred_ang16_22_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
void x265_intra_pred_ang32_34_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
void x265_intra_pred_ang32_2_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
+void x265_intra_pred_ang32_3_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
void x265_intra_pred_ang32_26_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
void x265_intra_pred_ang32_27_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
void x265_intra_pred_ang32_28_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
diff -r b252468dde7f -r 05df60c805d3 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Tue Jun 09 10:16:44 2015 +0530
+++ b/source/common/x86/intrapred16.asm Tue Jun 09 12:31:29 2015 +0530
@@ -12578,16 +12578,6 @@
call ang16_mode_3_33
RET
-cglobal intra_pred_ang32_3, 3,7,13
- add r2, 128
- xor r6d, r6d
- lea r3, [ang_table_avx2 + 16 * 32]
- add r1d, r1d
- lea r4, [r1 * 3]
-
- call ang16_mode_3_33
- RET
-
cglobal intra_pred_ang16_33, 3,7,13
xor r6d, r6d
inc r6d
@@ -13146,6 +13136,218 @@
; end of avx2 code for intra_pred_ang16 mode 2 to 34
;-------------------------------------------------------------------------------------------------------
+;-------------------------------------------------------------------------------------------------------
+; avx2 code for intra_pred_ang32 mode 2 to 34 start
+;-------------------------------------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal intra_pred_ang32_2, 3,5,6
+ lea r4, [r2]
+ add r2, 128
+ cmp r3m, byte 34
+ cmove r2, r4
+ add r1d, r1d
+ lea r3, [r1 * 3]
+ movu m0, [r2 + 4]
+ movu m1, [r2 + 20]
+ movu m3, [r2 + 36]
+ movu m4, [r2 + 52]
+
+ movu [r0], m0
+ movu [r0 + 32], m3
+ palignr m2, m1, m0, 2
+ palignr m5, m4, m3, 2
+ movu [r0 + r1], m2
+ movu [r0 + r1 + 32], m5
+ palignr m2, m1, m0, 4
+ palignr m5, m4, m3, 4
+ movu [r0 + r1 * 2], m2
+ movu [r0 + r1 * 2 + 32], m5
+ palignr m2, m1, m0, 6
+ palignr m5, m4, m3, 6
+ movu [r0 + r3], m2
+ movu [r0 + r3 + 32], m5
+
+ lea r0, [r0 + r1 * 4]
+ palignr m2, m1, m0, 8
+ palignr m5, m4, m3, 8
+ movu [r0], m2
+ movu [r0 + 32], m5
+ palignr m2, m1, m0, 10
+ palignr m5, m4, m3, 10
+ movu [r0 + r1], m2
+ movu [r0 + r1 + 32], m5
+ palignr m2, m1, m0, 12
+ palignr m5, m4, m3, 12
+ movu [r0 + r1 * 2], m2
+ movu [r0 + r1 * 2 + 32], m5
+ palignr m2, m1, m0, 14
+ palignr m5, m4, m3, 14
+ movu [r0 + r3], m2
+ movu [r0 + r3 + 32], m5
+
+ movu m0, [r2 + 36]
+ movu m3, [r2 + 68]
+ lea r0, [r0 + r1 * 4]
+ movu [r0], m1
+ movu [r0 + 32], m4
+ palignr m2, m0, m1, 2
+ palignr m5, m3, m4, 2
+ movu [r0 + r1], m2
+ movu [r0 + r1 + 32], m5
+ palignr m2, m0, m1, 4
+ palignr m5, m3, m4, 4
+ movu [r0 + r1 * 2], m2
+ movu [r0 + r1 * 2 + 32], m5
+ palignr m2, m0, m1, 6
+ palignr m5, m3, m4, 6
+ movu [r0 + r3], m2
+ movu [r0 + r3 + 32], m5
+
+ lea r0, [r0 + r1 * 4]
+ palignr m2, m0, m1, 8
+ palignr m5, m3, m4, 8
+ movu [r0], m2
+ movu [r0 + 32], m5
+ palignr m2, m0, m1, 10
+ palignr m5, m3, m4, 10
+ movu [r0 + r1], m2
+ movu [r0 + r1 + 32], m5
+ palignr m2, m0, m1, 12
+ palignr m5, m3, m4, 12
+ movu [r0 + r1 * 2], m2
+ movu [r0 + r1 * 2 + 32], m5
+ palignr m2, m0, m1, 14
+ palignr m5, m3, m4, 14
+ movu [r0 + r3], m2
+ movu [r0 + r3 + 32], m5
+
+ lea r0, [r0 + r1 * 4]
+ movu m1, [r2 + 52]
+ movu m4, [r2 + 84]
+
+ movu [r0], m0
+ movu [r0 + 32], m3
+ palignr m2, m1, m0, 2
+ palignr m5, m4, m3, 2
+ movu [r0 + r1], m2
+ movu [r0 + r1 + 32], m5
+ palignr m2, m1, m0, 4
+ palignr m5, m4, m3, 4
+ movu [r0 + r1 * 2], m2
+ movu [r0 + r1 * 2 + 32], m5
+ palignr m2, m1, m0, 6
+ palignr m5, m4, m3, 6
+ movu [r0 + r3], m2
+ movu [r0 + r3 + 32], m5
+
+ lea r0, [r0 + r1 * 4]
+ palignr m2, m1, m0, 8
+ palignr m5, m4, m3, 8
+ movu [r0], m2
+ movu [r0 + 32], m5
+ palignr m2, m1, m0, 10
+ palignr m5, m4, m3, 10
+ movu [r0 + r1], m2
+ movu [r0 + r1 + 32], m5
+ palignr m2, m1, m0, 12
+ palignr m5, m4, m3, 12
+ movu [r0 + r1 * 2], m2
+ movu [r0 + r1 * 2 + 32], m5
+ palignr m2, m1, m0, 14
+ palignr m5, m4, m3, 14
+ movu [r0 + r3], m2
+ movu [r0 + r3 + 32], m5
+
+ movu m0, [r2 + 68]
+ movu m3, [r2 + 100]
+ lea r0, [r0 + r1 * 4]
+ movu [r0], m1
+ movu [r0 + 32], m4
+ palignr m2, m0, m1, 2
+ palignr m5, m3, m4, 2
+ movu [r0 + r1], m2
+ movu [r0 + r1 + 32], m5
+ palignr m2, m0, m1, 4
+ palignr m5, m3, m4, 4
+ movu [r0 + r1 * 2], m2
+ movu [r0 + r1 * 2 + 32], m5
+ palignr m2, m0, m1, 6
+ palignr m5, m3, m4, 6
+ movu [r0 + r3], m2
+ movu [r0 + r3 + 32], m5
+
+ lea r0, [r0 + r1 * 4]
+ palignr m2, m0, m1, 8
+ palignr m5, m3, m4, 8
+ movu [r0], m2
+ movu [r0 + 32], m5
+ palignr m2, m0, m1, 10
+ palignr m5, m3, m4, 10
+ movu [r0 + r1], m2
+ movu [r0 + r1 + 32], m5
+ palignr m2, m0, m1, 12
+ palignr m5, m3, m4, 12
+ movu [r0 + r1 * 2], m2
+ movu [r0 + r1 * 2 + 32], m5
+ palignr m2, m0, m1, 14
+ palignr m5, m3, m4, 14
+ movu [r0 + r3], m2
+ movu [r0 + r3 + 32], m5
+ RET
+
+cglobal intra_pred_ang32_3, 3,8,13
+ add r2, 128
+ xor r6d, r6d
+ lea r3, [ang_table_avx2 + 16 * 32]
+ add r1d, r1d
+ lea r4, [r1 * 3]
+ lea r7, [r0 + 8 * r1]
+
+ call ang16_mode_3_33
+
+ add r2, 26
+ lea r0, [r0 + 32]
+
+ call ang16_mode_3_33
+
+ add r2, 6
+ lea r0, [r7 + 8 * r1]
+
+ call ang16_mode_3_33
+
+ add r2, 26
+ lea r0, [r0 + 32]
+
+ call ang16_mode_3_33
+ RET
+
+cglobal intra_pred_ang32_33, 3,7,13
+ xor r6d, r6d
+ inc r6d
+ lea r3, [ang_table_avx2 + 16 * 32]
+ add r1d, r1d
+ lea r4, [r1 * 3]
+ lea r5, [r0 + 32]
+
+ call ang16_mode_3_33
+
+ add r2, 26
+
+ call ang16_mode_3_33
+
+ add r2, 6
+ mov r0, r5
+
+ call ang16_mode_3_33
+
+ add r2, 26
+
+ call ang16_mode_3_33
+ RET
+;-------------------------------------------------------------------------------------------------------
+; end of avx2 code for intra_pred_ang32 mode 2 to 34
+;-------------------------------------------------------------------------------------------------------
+
%macro MODE_2_34 0
movu m0, [r2 + 4]
movu m1, [r2 + 20]