[x265] [PATCH] asm: fix intrapred_planar16x16 avx2 code for main12
rajesh at multicorewareinc.com
rajesh at multicorewareinc.com
Tue Nov 3 06:07:13 CET 2015
# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1446527171 -19800
# Tue Nov 03 10:36:11 2015 +0530
# Node ID d2c889865d6ff99cfeab4b69e898a4d0514f2440
# Parent 544dfa2c3a16efd0f679374ccc654aa4aefb1a49
asm: fix intrapred_planar16x16 avx2 code for main12
sse4:
intra_planar_16x16 4.42x 2220.50 9825.22
avx2:
intra_planar_16x16 8.53x 1275.02 10878.43
diff -r 544dfa2c3a16 -r d2c889865d6f source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Oct 28 11:42:09 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Nov 03 10:36:11 2015 +0530
@@ -1486,12 +1486,12 @@
p.cu[BLOCK_32x32].psy_cost_ss = PFX(psyCost_ss_32x32_avx2);
p.cu[BLOCK_64x64].psy_cost_ss = PFX(psyCost_ss_64x64_avx2);
p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_pp_4x4_avx2);
+ p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_avx2);
#if X265_DEPTH <= 10
p.cu[BLOCK_8x8].psy_cost_pp = PFX(psyCost_pp_8x8_avx2);
p.cu[BLOCK_16x16].psy_cost_pp = PFX(psyCost_pp_16x16_avx2);
p.cu[BLOCK_32x32].psy_cost_pp = PFX(psyCost_pp_32x32_avx2);
p.cu[BLOCK_64x64].psy_cost_pp = PFX(psyCost_pp_64x64_avx2);
- p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_avx2);
p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_avx2);
#endif
diff -r 544dfa2c3a16 -r d2c889865d6f source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Wed Oct 28 11:42:09 2015 +0530
+++ b/source/common/x86/const-a.asm Tue Nov 03 10:36:11 2015 +0530
@@ -137,6 +137,7 @@
const pd_524416, times 4 dd 524416
const pd_n32768, times 8 dd 0xffff8000
const pd_n131072, times 4 dd 0xfffe0000
+const pd_0000ffff, times 8 dd 0x0000FFFF
const pd_planar16_mul0, times 1 dd 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
const pd_planar16_mul1, times 1 dd 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
const trans8_shuf, times 1 dd 0, 4, 1, 5, 2, 6, 3, 7
diff -r 544dfa2c3a16 -r d2c889865d6f source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Wed Oct 28 11:42:09 2015 +0530
+++ b/source/common/x86/intrapred16.asm Tue Nov 03 10:36:11 2015 +0530
@@ -113,6 +113,7 @@
cextern pd_16
cextern pd_31
cextern pd_32
+cextern pd_0000ffff
cextern pw_4096
cextern pw_pixel_max
cextern multiL
@@ -1100,6 +1101,100 @@
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;---------------------------------------------------------------------------------------
INIT_YMM avx2
+%if ARCH_X86_64 == 1 && BIT_DEPTH == 12
+cglobal intra_pred_planar16, 3,5,11                      ; r0 = dst, r1 = dstStride, r2 = srcPix; r3/r4 declared because r4d is used as scratch below
+    add             r1d, r1d                             ; double the stride so [r0 + r1] steps one row of 16-bit samples
+
+    movzx           r4d, word [r2 + 34]                  ; A = word at srcPix + 34 (presumably the top-right reference sample - confirm against caller layout)
+    movd            xm3, r4d
+    vpbroadcastd    m3, xm3                              ; m3 = A in every dword
+
+    movzx           r4d, word [r2 + 98]                  ; B = word at srcPix + 98 (presumably the bottom-left reference sample - confirm)
+    movd            xm4, r4d
+    vpbroadcastd    m4, xm4                              ; m4 = B in every dword
+
+    pmovzxwd        m2, [r2 + 2]                         ; m2 = T[0..7]:  8 words at srcPix + 2, zero-extended to dwords
+    pmovzxwd        m5, [r2 + 18]                        ; m5 = T[8..15]: next 8 words, zero-extended to dwords
+
+    pmulld          m10, m3, [pd_planar16_mul1]          ; m10 = A * (1..8)
+    pmulld          m7, m3, [pd_planar16_mul1 + 32]      ; m7  = A * (9..16)
+
+    psubd           m10, m2
+    pslld           m1, m2, 4
+    paddd           m10, m1                              ; m10 = A * (x + 1) + 15 * T[x], x = 0..7
+
+    psubd           m7, m5
+    pslld           m6, m5, 4
+    paddd           m9, m6, m7                           ; m9  = A * (x + 1) + 15 * T[x], x = 8..15
+
+    paddd           m10, [pd_16]                         ; add the pd_16 constant (rounding term for the >> 5 below, assuming it holds dword 16s)
+    paddd           m9, [pd_16]
+    paddd           m7, m10, m4                          ; m7 = first-row accumulator, columns 0..7  (adds 1 * B)
+    paddd           m9, m4                               ; m9 = first-row accumulator, columns 8..15
+
+    psubd           m0, m4, m2                           ; m0 = B - T[0..7]:  amount added to m7 per output row
+    psubd           m8, m4, m5                           ; m8 = B - T[8..15]: amount added to m9 per output row
+
+    add             r2, 66                               ; r2 -> word 33 of srcPix, the per-row samples read by the macro
+    mova            m5, [pd_planar16_mul0]               ; m5 = weights 15..8 for columns 0..7
+    mova            m6, [pd_planar16_mul0 + 32]          ; m6 = weights 7..0 for columns 8..15
+    mova            m10, [pd_0000ffff]                   ; m10 = mask that keeps the low word of each dword
+
+%macro INTRA_PRED_PLANAR16_AVX2 1                        ; %1 = byte offset from r2 of a sample pair; emits two output rows
+    vpbroadcastd    m2, [r2 + %1]                        ; one dword = two adjacent 16-bit samples, copied to every lane
+    pand            m1, m2, m10                          ; m1 = first sample of the pair (low word)
+    psrld           m2, 16                               ; m2 = second sample of the pair (high word)
+
+    pmulld          m3, m1, m5                           ; first row, columns 0..7:  sample * weight
+    pmulld          m4, m1, m6                           ; first row, columns 8..15
+    pmulld          m1, m2, m5                           ; second row, columns 0..7
+    pmulld          m2, m2, m6                           ; second row, columns 8..15
+
+    paddd           m3, m7                               ; first row += running accumulators
+    paddd           m4, m9
+    paddd           m7, m0                               ; step the accumulators to the second row
+    paddd           m9, m8
+
+    psrad           m3, 5                                ; scale first row down by 32
+    psrad           m4, 5
+
+    paddd           m1, m7                               ; second row += stepped accumulators
+    paddd           m2, m9
+
+    psrad           m1, 5                                ; scale second row down by 32
+    psrad           m2, 5
+
+    paddd           m7, m0                               ; step the accumulators again for the next macro invocation
+    paddd           m9, m8
+
+    packssdw        m3, m4                               ; dwords -> words; packs per 128-bit lane, so column halves come out interleaved
+    packssdw        m1, m2
+
+    vpermq          m3, m3, q3120                        ; swap the middle qwords to restore column order 0..15
+    vpermq          m1, m1, q3120
+
+    movu            [r0], m3                             ; store first row (16 words)
+    movu            [r0 + r1], m1                        ; store second row
+%if %1 <= 24
+    lea             r0, [r0 + r1 * 2]                    ; advance dst by two rows; skipped after the final pair (%1 == 28)
+%endif
+%endmacro
+    INTRA_PRED_PLANAR16_AVX2 0                           ; rows 0-1
+    INTRA_PRED_PLANAR16_AVX2 4                           ; rows 2-3
+    INTRA_PRED_PLANAR16_AVX2 8                           ; rows 4-5
+    INTRA_PRED_PLANAR16_AVX2 12                          ; rows 6-7
+    INTRA_PRED_PLANAR16_AVX2 16                          ; rows 8-9
+    INTRA_PRED_PLANAR16_AVX2 20                          ; rows 10-11
+    INTRA_PRED_PLANAR16_AVX2 24                          ; rows 12-13
+    INTRA_PRED_PLANAR16_AVX2 28                          ; rows 14-15
+%undef INTRA_PRED_PLANAR16_AVX2
+    RET
+
+%else
+; code for BIT_DEPTH == 10 (this %else branch is also what gets assembled when ARCH_X86_64 != 1)
+;---------------------------------------------------------------------------------------
+; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
+;---------------------------------------------------------------------------------------
cglobal intra_pred_planar16, 3,3,4
add r1d, r1d
vpbroadcastw m3, [r2 + 34]
@@ -1143,6 +1238,7 @@
INTRA_PRED_PLANAR16_AVX2 28
%undef INTRA_PRED_PLANAR16_AVX2
RET
+%endif
%macro TRANSPOSE_4x4 0
punpckhwd m0, m1, m3
More information about the x265-devel
mailing list