[x265] [PATCH 2 of 2] asm: new avx2 algorithm on intra_pred_ang[11], 730c -> 481c
Min Chen
chenm003 at 163.com
Tue Aug 11 03:00:01 CEST 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1439250694 25200
# Node ID 0b255136720adbe5784f536ed1f5f0c912258523
# Parent 49938304ae9dc325d414872a62d70248dfa07fad
asm: new avx2 algorithm on intra_pred_ang[11], 730c -> 481c
---
source/common/x86/intrapred8.asm | 154 ++++++++++++++++++++++++++++++-------
source/test/intrapredharness.cpp | 2 +
2 files changed, 127 insertions(+), 29 deletions(-)
diff -r 49938304ae9d -r 0b255136720a source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm Mon Aug 10 16:51:31 2015 -0700
+++ b/source/common/x86/intrapred8.asm Mon Aug 10 16:51:34 2015 -0700
@@ -566,6 +566,9 @@
db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
+const angHor_tab_11, db (32-30), 30, (32-28), 28, (32-26), 26, (32-24), 24, (32-22), 22, (32-20), 20, (32-18), 18, (32-16), 16 ; byte pairs (32 - f, f) for f = 30, 28, ..., 16
+ db (32-14), 14, (32-12), 12, (32-10), 10, (32- 8), 8, (32- 6), 6, (32- 4), 4, (32- 2), 2, (32- 0), 0 ; byte pairs (32 - f, f) for f = 14, 12, ..., 0
+
const ang_table
%assign x 0
%rep 32
@@ -13411,36 +13414,129 @@
INTRA_PRED_TRANS_STORE_16x16
RET
-INIT_YMM avx2
-cglobal intra_pred_ang16_11, 3, 5, 12
- mova m11, [pw_1024]
-
- movu xm9, [r2 + 32]
- pinsrb xm9, [r2], 0
- pshufb xm9, [intra_pred_shuff_0_8]
- vinserti128 m9, m9, xm9, 1
-
- vbroadcasti128 m10, [r2 + 8 + 32]
- pshufb m10, [intra_pred_shuff_0_8]
-
- lea r3, [3 * r1]
- lea r4, [c_ang16_mode_11]
-
- INTRA_PRED_ANG16_CAL_ROW m0, m1, 0
- INTRA_PRED_ANG16_CAL_ROW m1, m2, 1
- INTRA_PRED_ANG16_CAL_ROW m2, m3, 2
- INTRA_PRED_ANG16_CAL_ROW m3, m4, 3
-
- add r4, 4 * mmsize
-
- INTRA_PRED_ANG16_CAL_ROW m4, m5, 0
- INTRA_PRED_ANG16_CAL_ROW m5, m6, 1
- INTRA_PRED_ANG16_CAL_ROW m6, m7, 2
- INTRA_PRED_ANG16_CAL_ROW m7, m8, 3
-
- ; transpose and store
- INTRA_PRED_TRANS_STORE_16x16
- RET
+
+INIT_YMM avx2
+cglobal intra_pred_ang16_11, 3,4,5 ; r0 = destination, r1 = destination stride, r2 = reference pixels; also uses r3 and m0-m4
+ mova m0, [angHor_tab_11] ; m0 = 16 byte pairs of weights (32 - f, f), one pair per output column
+ mova m1, [pw_1024] ; pmulhrsw multiplier; with words of 1024 (as the name suggests - defined outside this patch) the result is (x + 16) >> 5
+ lea r3, [r1 * 3] ; r3 = 3 * stride, used to address every fourth row
+
+ ; prepare reference pixels [0 -1 -2 ...]: 16 bytes from r2 + 32 with byte 0 replaced by the byte at [r2]
+ movu xm2, [r2 + 32]
+ ; TODO: the input reference pixel buffer needs a duplicate of pixel_lt so that this insert instruction can be saved in every mode
+ pinsrb xm2, [r2], 0
+ pshufb xm2, [intra_pred_shuff_0_8] ; [0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8]
+
+
+ vpbroadcastw m3, xm2 ; word [1 0]
+ psrldq xm2, 2 ; shift the next byte pair into the low word
+ vpbroadcastw m4, xm2 ; word [2 1]
+ psrldq xm2, 2
+ pmaddubsw m3, m0 ; unsigned pixel bytes times signed weight bytes, adjacent products summed into 16 words
+ pmaddubsw m4, m0
+ pmulhrsw m3, m1 ; rounded scaling of each word by m1
+ pmulhrsw m4, m1
+ packuswb m3, m4 ; saturate words to bytes; m3 and m4 results are interleaved per 128-bit lane
+ vpermq m3, m3, q3120 ; reorder qwords so the low 128 bits hold the m3 result and the high 128 bits the m4 result
+ movu [r0], xm3 ; row 0
+ vextracti128 [r0 + r1], m3, 1 ; row 1
+
+ vpbroadcastw m3, xm2 ; word [3 2]
+ psrldq xm2, 2
+ vpbroadcastw m4, xm2 ; word [4 3]
+ psrldq xm2, 2
+ pmaddubsw m3, m0
+ pmaddubsw m4, m0
+ pmulhrsw m3, m1
+ pmulhrsw m4, m1
+ packuswb m3, m4
+ vpermq m3, m3, q3120
+ movu [r0 + r1 * 2], xm3 ; row 2
+ vextracti128 [r0 + r3], m3, 1 ; row 3
+ lea r0, [r0 + r1 * 4] ; advance the destination by four rows
+
+ vpbroadcastw m3, xm2 ; word [5 4]
+ psrldq xm2, 2
+ vpbroadcastw m4, xm2 ; word [6 5]
+ psrldq xm2, 2
+ pmaddubsw m3, m0
+ pmaddubsw m4, m0
+ pmulhrsw m3, m1
+ pmulhrsw m4, m1
+ packuswb m3, m4
+ vpermq m3, m3, q3120
+ movu [r0], xm3 ; row 4
+ vextracti128 [r0 + r1], m3, 1 ; row 5
+
+ vpbroadcastw m3, xm2 ; word [7 6]
+ psrldq xm2, 2
+ vpbroadcastw m4, xm2 ; word [8 7], the last pair of the first load
+ pmaddubsw m3, m0
+ pmaddubsw m4, m0
+ pmulhrsw m3, m1
+ pmulhrsw m4, m1
+ packuswb m3, m4
+ vpermq m3, m3, q3120
+ movu [r0 + r1 * 2], xm3 ; row 6
+ vextracti128 [r0 + r3], m3, 1 ; row 7
+ lea r0, [r0 + r1 * 4] ; advance the destination by four rows
+
+ ; loading new reference pixels
+ movu xm2, [r2 + 32 + 8] ; 16 bytes starting 8 bytes after the first load; no top-left insert is needed here
+ pshufb xm2, [intra_pred_shuff_0_8] ; [8 9 9 A A B B C C D D E E F F 10]
+
+ vpbroadcastw m3, xm2 ; word [9 8]
+ psrldq xm2, 2
+ vpbroadcastw m4, xm2 ; word [A 9]
+ psrldq xm2, 2
+ pmaddubsw m3, m0
+ pmaddubsw m4, m0
+ pmulhrsw m3, m1
+ pmulhrsw m4, m1
+ packuswb m3, m4
+ vpermq m3, m3, q3120
+ movu [r0], xm3 ; row 8
+ vextracti128 [r0 + r1], m3, 1 ; row 9
+
+ vpbroadcastw m3, xm2 ; word [B A]
+ psrldq xm2, 2
+ vpbroadcastw m4, xm2 ; word [C B]
+ psrldq xm2, 2
+ pmaddubsw m3, m0
+ pmaddubsw m4, m0
+ pmulhrsw m3, m1
+ pmulhrsw m4, m1
+ packuswb m3, m4
+ vpermq m3, m3, q3120
+ movu [r0 + r1 * 2], xm3 ; row 10
+ vextracti128 [r0 + r3], m3, 1 ; row 11
+ lea r0, [r0 + r1 * 4] ; advance the destination by four rows
+
+ vpbroadcastw m3, xm2 ; word [D C]
+ psrldq xm2, 2
+ vpbroadcastw m4, xm2 ; word [E D]
+ psrldq xm2, 2
+ pmaddubsw m3, m0
+ pmaddubsw m4, m0
+ pmulhrsw m3, m1
+ pmulhrsw m4, m1
+ packuswb m3, m4
+ vpermq m3, m3, q3120
+ movu [r0], xm3 ; row 12
+ vextracti128 [r0 + r1], m3, 1 ; row 13
+
+ vpbroadcastw m3, xm2 ; word [F E]
+ psrldq xm2, 2
+ vpbroadcastw m4, xm2 ; word [10 F], the last pair of the second load
+ pmaddubsw m3, m0
+ pmaddubsw m4, m0
+ pmulhrsw m3, m1
+ pmulhrsw m4, m1
+ packuswb m3, m4
+ vpermq m3, m3, q3120
+ movu [r0 + r1 * 2], xm3 ; row 14
+ vextracti128 [r0 + r3], m3, 1 ; row 15
+ RET
; transpose 8x32 to 16x16, used for intra_ang16x16 avx2 asm
diff -r 49938304ae9d -r 0b255136720a source/test/intrapredharness.cpp
--- a/source/test/intrapredharness.cpp Mon Aug 10 16:51:31 2015 -0700
+++ b/source/test/intrapredharness.cpp Mon Aug 10 16:51:34 2015 -0700
@@ -130,6 +130,8 @@
if (memcmp(pixel_out_vec + k * FENC_STRIDE, pixel_out_c + k * FENC_STRIDE, width * sizeof(pixel)))
{
printf("ang_%dx%d, Mode = %d, Row = %d failed !!\n", width, width, pmode, k);
+ ref[pmode](pixel_out_c, stride, pixel_buff + j, pmode, bFilter);
+ opt[pmode](pixel_out_vec, stride, pixel_buff + j, pmode, bFilter);
return false;
}
}
More information about the x265-devel
mailing list