[x265] [PATCH] asm: fix intrapred_planar32x32 avx2 code for main12
rajesh at multicorewareinc.com
rajesh at multicorewareinc.com
Thu Nov 12 13:29:54 CET 2015
# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1447331262 -19800
# Thu Nov 12 17:57:42 2015 +0530
# Node ID fcfb9bd67e1e00a745ce0bb6f151aa9851c197c1
# Parent 45ea73c63c12c66e5e5e777e80853c8b3cadf101
asm: fix intrapred_planar32x32 avx2 code for main12
sse4:
intra_planar_32x32 5.09x 8533.13 43461.37
avx2:
intra_planar_32x32 9.90x 4259.14 42155.95
diff -r 45ea73c63c12 -r fcfb9bd67e1e source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Nov 06 11:55:27 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Nov 12 17:57:42 2015 +0530
@@ -1487,12 +1487,12 @@
p.cu[BLOCK_64x64].psy_cost_ss = PFX(psyCost_ss_64x64_avx2);
p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_pp_4x4_avx2);
p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_avx2);
+ p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_avx2);
#if X265_DEPTH <= 10
p.cu[BLOCK_8x8].psy_cost_pp = PFX(psyCost_pp_8x8_avx2);
p.cu[BLOCK_16x16].psy_cost_pp = PFX(psyCost_pp_16x16_avx2);
p.cu[BLOCK_32x32].psy_cost_pp = PFX(psyCost_pp_32x32_avx2);
p.cu[BLOCK_64x64].psy_cost_pp = PFX(psyCost_pp_64x64_avx2);
- p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_avx2);
#endif
p.cu[BLOCK_16x16].intra_pred[DC_IDX] = PFX(intra_pred_dc16_avx2);
diff -r 45ea73c63c12 -r fcfb9bd67e1e source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Fri Nov 06 11:55:27 2015 +0530
+++ b/source/common/x86/const-a.asm Thu Nov 12 17:57:42 2015 +0530
@@ -124,7 +124,7 @@
const pd_8, times 4 dd 8
const pd_15, times 8 dd 15
const pd_16, times 8 dd 16
-const pd_31, times 4 dd 31
+const pd_31, times 8 dd 31
const pd_32, times 8 dd 32
const pd_64, times 4 dd 64
const pd_128, times 4 dd 128
@@ -138,8 +138,11 @@
const pd_n32768, times 8 dd 0xffff8000
const pd_n131072, times 4 dd 0xfffe0000
const pd_0000ffff, times 8 dd 0x0000FFFF
-const pd_planar16_mul0, times 1 dd 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-const pd_planar16_mul1, times 1 dd 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+const pd_planar16_mul0, times 1 dd 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ; 16 dwords, descending 15..0
+const pd_planar16_mul1, times 1 dd 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ; 16 dwords, ascending 1..16; bytes +32.. hold 9..16
+const pd_planar32_mul1, times 1 dd 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16 ; 16 dwords, descending 31..16; bytes +32.. hold 23..16
+const pd_planar32_mul2, times 1 dd 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 ; 16 dwords, ascending 17..32; bytes +32.. hold 25..32
+const pd_planar16_mul2, times 1 dd 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ; 16 dwords, descending 15..0; bytes +32.. hold 7..0. NOTE(review): values are identical to pd_planar16_mul0 -- confirm a separate symbol is really needed
const trans8_shuf, times 1 dd 0, 4, 1, 5, 2, 6, 3, 7
const popcnt_table
diff -r 45ea73c63c12 -r fcfb9bd67e1e source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Fri Nov 06 11:55:27 2015 +0530
+++ b/source/common/x86/intrapred16.asm Thu Nov 12 17:57:42 2015 +0530
@@ -128,6 +128,9 @@
cextern pd_planar16_mul0
cextern pd_planar16_mul1
cextern pw_planar32_mul
+cextern pd_planar32_mul1
+cextern pd_planar32_mul2
+cextern pd_planar16_mul2
;-----------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter)
@@ -1038,6 +1041,126 @@
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;---------------------------------------------------------------------------------------
INIT_YMM avx2
+%if ARCH_X86_64 == 1 && BIT_DEPTH == 12
+cglobal intra_pred_planar32, 3,4,16 ; 32x32 planar intra prediction, BIT_DEPTH == 12 path: all sums are kept in 32-bit lanes. Per the prototype comment above: r0 = dst, r1 = dstStride, r2 = srcPix (x86inc argument mapping assumed); r3 is scratch
+ pmovzxwd m1, [r2 + 2] ; m1 = srcPix[1..8] zero-extended to dwords: reference row for columns 0-7 (presumably the "above" neighbours -- confirm against caller)
+ pmovzxwd m4, [r2 + 34] ; m4 = srcPix[17..24]: reference row for columns 16-23
+ pmovzxwd m2, [r2 + 18] ; m2 = srcPix[9..16]: reference row for columns 8-15
+ pmovzxwd m3, [r2 + 50] ; m3 = srcPix[25..32]: reference row for columns 24-31
+ lea r2, [r2 + 66] ; r2 -> srcPix[33]
+
+ movzx r3d, word [r2] ; r3d = srcPix[33] (presumably the top-right sample; called TR below)
+ movd xm5, r3d
+ vpbroadcastd m5, xm5 ; m5 = TR in all 8 dwords
+
+ pslld m8, m5, 3 ; m8 = 8 * TR
+ pmulld m7, m5, [pd_planar16_mul1 + 32] ; m7 = TR * (9..16): weight (x+1) for columns 8-15
+ psubd m6, m7, m8 ; m6 = TR * (1..8): weight (x+1) for columns 0-7
+ pmulld m9, m5, [pd_planar32_mul2 + 32] ; m9 = TR * (25..32): weight (x+1) for columns 24-31
+ psubd m8, m9, m8 ; m8 = TR * (17..24): weight (x+1) for columns 16-23
+
+ movzx r3d, word [r2 + 128] ; r3d = srcPix[97] (presumably the bottom-left sample; called BL below)
+ movd xm10, r3d
+ vpbroadcastd m10, xm10 ; m10 = BL in all 8 dwords
+
+ mova m11, m10
+ paddd m11, [pd_32] ; m11 = BL + 32; the 32 is the rounding term for the final >> 6
+
+ paddd m6, m11 ; fold (BL + 32) into each of the four column-group bases
+ paddd m7, m11
+ paddd m8, m11
+ paddd m9, m11
+
+ psubd m0, m10, m1 ; m0 = BL - ref[0..7]
+ mova m13, m0 ; m13 = per-row step for columns 0-7 (added by INCREMENT_AVX2)
+ pslld m5, m1, 5 ; m5 = 32 * ref[0..7]
+ psubd m1, m5, m1 ; m1 = 31 * ref[0..7]
+ paddd m12, m6, m1 ; m12 = row-0 accumulator for columns 0-7
+
+ psubd m5, m10, m4 ; m5 = BL - ref[16..23]
+ mova m6, m5 ; m6 = per-row step for columns 16-23 (m6 is reused; its old value already went into m12)
+ pslld m1, m4, 5
+ psubd m4, m1, m4 ; m4 = 31 * ref[16..23]
+ paddd m14, m8, m4 ; m14 = row-0 accumulator for columns 16-23
+
+ psubd m1, m10, m2 ; m1 = BL - ref[8..15]
+ mova m8, m1 ; m8 = per-row step for columns 8-15 (m8 is reused; its old value already went into m14)
+ pslld m4, m2, 5
+ psubd m2, m4, m2 ; m2 = 31 * ref[8..15]
+ paddd m7, m2 ; m7 = row-0 accumulator for columns 8-15
+
+ psubd m11, m10, m3 ; m11 = BL - ref[24..31]
+ mova m15, m11 ; m15 = per-row step for columns 24-31
+ pslld m4, m3, 5
+ psubd m3, m4, m3 ; m3 = 31 * ref[24..31]
+ paddd m9, m3 ; m9 = row-0 accumulator for columns 24-31
+
+ mova m2, [pd_planar32_mul1 + 32] ; m2 = 23..16: weight (31-x) for columns 8-15
+ mova m4, [pd_planar16_mul2 + 32] ; m4 = 7..0: weight (31-x) for columns 24-31
+
+ add r1, r1 ; double the stride -- presumably converts a stride counted in 16-bit pixels to bytes; TODO confirm
+
+%macro PROCESS_AVX2 1 ; emit one 32-pixel output row at [r0]; %1 = index of the 16-bit per-row sample relative to r2. Clobbers r3, m0, m1, m3, m5, m11
+ movzx r3d, word [r2 + %1 * 2] ; r3d = per-row reference sample (presumably left[row]; called L below)
+ movd xm0, r3d
+ vpbroadcastd m0, xm0 ; m0 = L in all 8 dwords
+
+ pmulld m1, m0, m2 ; m1 = L * (23..16): columns 8-15
+ pslld m3, m0, 3 ; m3 = 8 * L
+ paddd m5, m1, m3 ; m5 = L * (31..24): columns 0-7
+ pmulld m0, m4 ; m0 = L * (7..0): columns 24-31
+ paddd m11, m0, m3 ; m11 = L * (15..8): columns 16-23
+
+ paddd m5, m12 ; add the running accumulators for the current row
+ paddd m1, m7
+ paddd m11, m14
+ paddd m0, m9
+
+ psrad m5, 6 ; >> 6 (rounding term 32 is already in the accumulators)
+ psrad m1, 6
+ psrad m11, 6
+ psrad m0, 6
+
+ packssdw m5, m1 ; dwords -> signed-saturated words; packs within each 128-bit lane, so qwords come out interleaved
+ packssdw m11, m0
+
+ vpermq m5, m5, q3120 ; swap the two middle qwords to restore ascending order: columns 0-15
+ vpermq m11, m11, q3120 ; columns 16-31
+
+ movu [r0], m5 ; store columns 0-15
+ movu [r0 + mmsize], m11 ; store columns 16-31
+%endmacro
+
+%macro INCREMENT_AVX2 0 ; advance the four accumulators and the dst pointer to the next row
+ paddd m12, m13 ; each accumulator += (BL - ref) for its column group
+ paddd m14, m6
+ paddd m7, m8
+ paddd m9, m15
+
+ add r0, r1 ; r0 += doubled stride
+%endmacro
+
+ add r2, mmsize*2 ; r2 += 2*mmsize bytes (64 if mmsize is 32 under INIT_YMM) -> srcPix[65], the first per-row sample
+%assign x 0
+%rep 4 ; 4 groups of 8 rows = 32 rows, fully unrolled at assembly time
+%assign y 0
+%rep 8
+ PROCESS_AVX2 y
+%if x + y < 10 ; x is 0..3 and y is 0..7, so this is false only for x == 3, y == 7: no increment after the last row
+ INCREMENT_AVX2
+%endif
+%assign y y+1
+%endrep
+lea r2, [r2 + 16] ; step r2 past the 8 per-row samples just consumed
+%assign x x+1
+%endrep
+ RET
+
+%else
+; fallback path: used when BIT_DEPTH != 12 (e.g. BIT_DEPTH == 10) or when not building for x86_64
+;---------------------------------------------------------------------------------------
+; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
+;---------------------------------------------------------------------------------------
cglobal intra_pred_planar32, 3,3,8
movu m1, [r2 + 2]
movu m4, [r2 + 34]
@@ -1096,6 +1219,7 @@
%assign x x+1
%endrep
RET
+%endif
;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
More information about the x265-devel
mailing list