[x265] [PATCH 4 of 7] asm: intra_pred_ang16_8 improved by ~28% over SSE4
praveen at multicorewareinc.com
Tue Apr 7 14:56:49 CEST 2015
# HG changeset patch
# User Praveen Tiwari <praveen at multicorewareinc.com>
# Date 1428404532 -19800
# Tue Apr 07 16:32:12 2015 +0530
# Node ID 255b6935884a682ed6b8c8c006588d107aca1dcb
# Parent 9ada8e80cff7bd184a70af4dbb87c0b3a704d59d
asm: intra_pred_ang16_8 improved by ~28% over SSE4
AVX2:
intra_ang_16x16[ 8] 14.70x 792.85 11653.86
SSE4:
intra_ang_16x16[ 8] 11.28x 1014.29 11441.50
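(For reference, the ~28% figure in the subject follows from the cycle counts above: 1014.29 / 792.85 ≈ 1.28, i.e. a ~1.28x speedup of the AVX2 primitive over the SSE4 one. The x.xx column is each primitive's speedup over the C reference, whose cycle count appears in the last column, e.g. 11653.86 / 792.85 ≈ 14.70.)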
diff -r 9ada8e80cff7 -r 255b6935884a source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Apr 07 13:35:25 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Apr 07 16:32:12 2015 +0530
@@ -1761,6 +1761,7 @@
p.cu[BLOCK_8x8].intra_pred[12] = x265_intra_pred_ang8_12_avx2;
p.cu[BLOCK_8x8].intra_pred[24] = x265_intra_pred_ang8_24_avx2;
p.cu[BLOCK_8x8].intra_pred[11] = x265_intra_pred_ang8_11_avx2;
+ p.cu[BLOCK_16x16].intra_pred[8] = x265_intra_pred_ang16_8_avx2;
p.cu[BLOCK_16x16].intra_pred[9] = x265_intra_pred_ang16_9_avx2;
p.cu[BLOCK_16x16].intra_pred[11] = x265_intra_pred_ang16_11_avx2;
p.cu[BLOCK_16x16].intra_pred[25] = x265_intra_pred_ang16_25_avx2;
diff -r 9ada8e80cff7 -r 255b6935884a source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h Tue Apr 07 13:35:25 2015 +0530
+++ b/source/common/x86/intrapred.h Tue Apr 07 16:32:12 2015 +0530
@@ -233,6 +233,7 @@
void x265_intra_pred_ang8_12_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
void x265_intra_pred_ang8_24_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
void x265_intra_pred_ang8_11_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
+void x265_intra_pred_ang16_8_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
void x265_intra_pred_ang16_9_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
void x265_intra_pred_ang16_11_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
void x265_intra_pred_ang16_25_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
diff -r 9ada8e80cff7 -r 255b6935884a source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm Tue Apr 07 13:35:25 2015 +0530
+++ b/source/common/x86/intrapred8.asm Tue Apr 07 16:32:12 2015 +0530
@@ -167,6 +167,15 @@
ALIGN 32
intra_pred_shuff_0_15: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 15
+ALIGN 32
+c_ang16_mode_8: db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
+ db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
+ db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
+ db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
+ db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1
+ db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
+ db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
+ db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
ALIGN 32
c_ang16_mode_29: db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
@@ -12067,6 +12076,92 @@
INTRA_PRED_TRANS_STORE_16x16
RET
+
+INIT_YMM avx2
+cglobal intra_pred_ang16_8, 3, 5, 12
+ mova m11, [pw_1024]
+
+ movu xm9, [r2 + 1 + 32]
+ pshufb xm9, [intra_pred_shuff_0_8]
+ movu xm10, [r2 + 9 + 32]
+ pshufb xm10, [intra_pred_shuff_0_8]
+
+ movu xm7, [r2 + 2 + 32]
+ pshufb xm7, [intra_pred_shuff_0_8]
+ vinserti128 m9, m9, xm7, 1
+
+ movu xm8, [r2 + 10 + 32]
+ pshufb xm8, [intra_pred_shuff_0_8]
+ vinserti128 m10, m10, xm8, 1
+
+ lea r3, [3 * r1]
+ lea r4, [c_ang16_mode_8]
+
+ pmaddubsw m0, m9, [r4 + 0 * mmsize]
+ pmulhrsw m0, m11
+ pmaddubsw m1, m10, [r4 + 0 * mmsize]
+ pmulhrsw m1, m11
+ packuswb m0, m1
+
+ pmaddubsw m1, m9, [r4 + 1 * mmsize]
+ pmulhrsw m1, m11
+ pmaddubsw m2, m10, [r4 + 1 * mmsize]
+ pmulhrsw m2, m11
+ packuswb m1, m2
+
+ pmaddubsw m2, m9, [r4 + 2 * mmsize]
+ pmulhrsw m2, m11
+ pmaddubsw m3, m10, [r4 + 2 * mmsize]
+ pmulhrsw m3, m11
+ packuswb m2, m3
+
+ pmaddubsw m3, m9, [r4 + 3 * mmsize]
+ pmulhrsw m3, m11
+ pmaddubsw m4, m10, [r4 + 3 * mmsize]
+ pmulhrsw m4, m11
+ packuswb m3, m4
+
+ add r4, 4 * mmsize
+
+ movu xm4, [r2 + 3 + 32]
+ pshufb xm4, [intra_pred_shuff_0_8]
+ vinserti128 m9, m9, xm4, 1
+
+ movu xm5, [r2 + 11 + 32]
+ pshufb xm5, [intra_pred_shuff_0_8]
+ vinserti128 m10, m10, xm5, 1
+
+ pmaddubsw m4, m9, [r4 + 0 * mmsize]
+ pmulhrsw m4, m11
+ pmaddubsw m5, m10, [r4 + 0 * mmsize]
+ pmulhrsw m5, m11
+ packuswb m4, m5
+
+ pmaddubsw m5, m9, [r4 + 1 * mmsize]
+ pmulhrsw m5, m11
+ pmaddubsw m6, m10, [r4 + 1 * mmsize]
+ pmulhrsw m6, m11
+ packuswb m5, m6
+
+ vinserti128 m9, m9, xm7, 0
+ vinserti128 m10, m10, xm8, 0
+
+ pmaddubsw m6, m9, [r4 + 2 * mmsize]
+ pmulhrsw m6, m11
+ pmaddubsw m7, m10, [r4 + 2 * mmsize]
+ pmulhrsw m7, m11
+ packuswb m6, m7
+
+ pmaddubsw m7, m9, [r4 + 3 * mmsize]
+ pmulhrsw m7, m11
+ pmaddubsw m8, m10, [r4 + 3 * mmsize]
+ pmulhrsw m8, m11
+ packuswb m7, m8
+
+ ; transpose and store
+ INTRA_PRED_TRANS_STORE_16x16
+ RET
+
INIT_YMM avx2
cglobal intra_pred_ang16_9, 3, 5, 12
mova m11, [pw_1024]
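For context, mode 8 is a horizontal angular mode with intraPredAngle = 5, so each output row y blends two adjacent left-neighbour samples with weights (32 - frac, frac), where frac = ((y + 1) * 5) & 31; those are exactly the byte pairs stored in c_ang16_mode_8 above, and the final transpose store is needed because horizontal modes are computed as-if-vertical from the left reference. Below is a minimal scalar sketch of the computation the AVX2 routine vectorizes. It is illustrative only (not the x265 reference code), omits the unused dirMode/bFilter parameters, and assumes the usual srcPix neighbour layout for 16x16 (srcPix[0] = top-left, [1..32] = above row, [33..64] = left column), which is why the asm loads start at r2 + 1 + 32.

#include <stdint.h>

typedef uint8_t pixel;

/* Scalar model of intra_pred_ang16_8 (illustrative, not the x265 reference). */
static void intra_pred_ang16_8_sketch(pixel* dst, intptr_t dstStride, const pixel* srcPix)
{
    const pixel* left = srcPix + 1 + 32;      /* assumed layout: [33..64] = left neighbours */

    for (int y = 0; y < 16; y++)
    {
        int pos  = (y + 1) * 5;               /* intraPredAngle = 5 for mode 8 */
        int idx  = pos >> 5;                  /* integer step into the left samples */
        int frac = pos & 31;                  /* fractional weight, see c_ang16_mode_8 */

        for (int x = 0; x < 16; x++)
        {
            /* Two-tap blend with rounding; pmaddubsw + pmulhrsw with pw_1024
               computes the same (w + 16) >> 5 in the AVX2 code. */
            int w = (32 - frac) * left[idx + x] + frac * left[idx + x + 1];

            /* Horizontal mode: the block is computed as-if-vertical and stored
               transposed (cf. INTRA_PRED_TRANS_STORE_16x16). */
            dst[x * dstStride + y] = (pixel)((w + 16) >> 5);
        }
    }
}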