[x265] [PATCH] asm: fix intrapred_planar32x32 sse2 code for main12
rajesh at multicorewareinc.com
rajesh at multicorewareinc.com
Tue Nov 17 13:49:25 CET 2015
# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1447764366 -19800
# Tue Nov 17 18:16:06 2015 +0530
# Node ID 069c502d4957f36bb5887158b13dfe94d4e0f737
# Parent e8f9a60d4cd9e73c9f2baf05c2ccda5af1892b46
asm: fix intrapred_planar32x32 sse2 code for main12
intra_planar_32x32 7.13x 6106.53 43568.75
diff -r e8f9a60d4cd9 -r 069c502d4957 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Nov 16 16:44:33 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Nov 17 18:16:06 2015 +0530
@@ -963,10 +963,7 @@
p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar4_sse2);
p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar8_sse2);
p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_sse2);
-
-#if X265_DEPTH <= 10
p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_sse2);
-#endif /* X265_DEPTH <= 10 */
ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse2);
p.cu[BLOCK_4x4].intra_pred[2] = PFX(intra_pred_ang4_2_sse2);
diff -r e8f9a60d4cd9 -r 069c502d4957 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Mon Nov 16 16:44:33 2015 +0530
+++ b/source/common/x86/intrapred16.asm Tue Nov 17 18:16:06 2015 +0530
@@ -933,6 +933,178 @@
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;---------------------------------------------------------------------------------------
INIT_XMM sse2
+%if ARCH_X86_64 == 1 && BIT_DEPTH == 12
+cglobal intra_pred_planar32, 3,7,16
+ ; NOTE: reserve 4*mmsize bytes of scratch and align rsp down to 64 bytes, so all of the local data (spill slots m16-m19) sits in the same cache line
+ mov r6, rsp ; keep the caller's rsp in r6; it is restored just before RET
+ sub rsp, 4*mmsize ; room for four xmm-sized spill slots
+ and rsp, ~63 ; round rsp down to a 64-byte boundary (also makes the mova spills below aligned)
+ %define m16 [rsp + 0 * mmsize]
+ %define m17 [rsp + 1 * mmsize]
+ %define m18 [rsp + 2 * mmsize]
+ %define m19 [rsp + 3 * mmsize]
+
+ add r1, r1 ; r1 = dstStride * 2 (byte stride; pixels are stored as 16-bit words here)
+ pxor m12, m12 ; m12 = 0, used to zero-extend words to dwords until it is reused as scratch
+
+ movzx r3d, word [r2 + 66] ; topRight = above[32]
+ lea r4, [planar32_table1] ; weight table multiplied with topRight (defined outside this hunk)
+
+ movd m0, r3d
+ pshufd m0, m0, 0 ; broadcast topRight to all four dwords
+
+ pmaddwd m8, m0, [r4 + 0] ; accumulators: m8/m9 = columns 0-7, m10/m11 = 8-15, m7/m13 = 16-23, m14/m15 = 24-31 (4 dwords each)
+ pmaddwd m9, m0, [r4 + 16]
+ pmaddwd m10, m0, [r4 + 32]
+ pmaddwd m11, m0, [r4 + 48]
+ pmaddwd m7, m0, [r4 + 64]
+ pmaddwd m13, m0, [r4 + 80]
+ pmaddwd m14, m0, [r4 + 96]
+ pmaddwd m15, m0, [r4 + 112]
+
+ movzx r3d, word [r2 + 194] ; presumably bottomLeft = left[32] (the row loop reads its per-row word starting at r2 + 130) - confirm against caller
+ movd m0, r3d
+ pshufd m0, m0, 0 ; broadcast it; m0 keeps this value for the rest of the setup
+
+ paddd m8, m0 ; add the broadcast value once to every accumulator
+ paddd m9, m0
+ paddd m10, m0
+ paddd m11, m0
+ paddd m7, m0
+ paddd m13, m0
+ paddd m14, m0
+ paddd m15, m0
+
+ paddd m8, [pd_32] ; pd_32 is defined elsewhere; presumably the rounding term for the final >> 6
+ paddd m9, [pd_32]
+ paddd m10, [pd_32]
+ paddd m11, [pd_32]
+ paddd m7, [pd_32]
+ paddd m13, [pd_32]
+ paddd m14, [pd_32]
+ paddd m15, [pd_32]
+
+ movu m1, [r2 + 2] ; above[0..7] as words
+ punpckhwd m5, m1, m12 ; m5 = above[4..7] zero-extended to dwords
+ pmaddwd m2, m5, [pd_31] ; above[4..7] * pd_31 (constant defined elsewhere)
+ paddd m9, m2
+ psubd m2, m0, m5 ; m2 = m0 - above[4..7]: amount added to m9 after every row
+
+ punpcklwd m1, m12 ; m1 = above[0..3] as dwords
+ pmaddwd m5, m1, [pd_31]
+ paddd m8, m5
+ psubd m3, m0, m1 ; m3 = per-row step for m8
+
+ movu m1, [r2 + 18] ; above[8..15]
+ punpckhwd m5, m1, m12 ; above[12..15]
+ pmaddwd m4, m5, [pd_31]
+ paddd m11, m4
+ psubd m4, m0, m5 ; m4 = per-row step for m11
+
+ punpcklwd m1, m12 ; above[8..11]
+ pmaddwd m5, m1, [pd_31]
+ paddd m10, m5
+ psubd m5, m0, m1
+ mova m16, m5 ; per-row step for m10 lives in stack slot m16 (no free xmm register)
+
+ movu m1, [r2 + 34] ; above[16..23]
+ punpckhwd m6, m1, m12 ; above[20..23]
+ psubd m5, m0, m6 ; m5 = per-row step for m13
+ pmaddwd m6, [pd_31]
+ paddd m13, m6
+
+ punpcklwd m6, m1, m12 ; above[16..19]
+ psubd m1, m0, m6
+ mova m17, m1 ; per-row step for m7 lives in stack slot m17
+ pmaddwd m6, [pd_31]
+ paddd m7, m6
+
+ movu m1, [r2 + 50] ; above[24..31]
+ mova m18, m1 ; keep the raw words in slot m18: m1 is overwritten before the low half is unpacked
+ punpckhwd m6, m1, m12 ; above[28..31]
+ psubd m1, m0, m6 ; m1 = per-row step for m15
+ pmaddwd m6, [pd_31]
+ paddd m15, m6
+
+ punpcklwd m6, m18, m12 ; above[24..27], reloaded from slot m18
+ psubd m12, m0, m6 ; m12 stops being the zero register here and is plain scratch from now on
+ mova m19, m12 ; per-row step for m14 lives in stack slot m19
+ pmaddwd m6, [pd_31]
+ paddd m14, m6
+
+ add r2, 130 ; r2 now points at the first per-row word consumed by the macro below
+ lea r5, [planar32_table] ; weight table multiplied with the per-row word (defined outside this hunk)
+
+%macro INTRA_PRED_PLANAR32_sse2 0
+ movzx r3d, word [r2] ; word for the current row
+ movd m0, r3d
+ pshufd m0, m0, 0 ; broadcast to four dwords
+
+ pmaddwd m6, m0, [r5] ; row word * table weights, columns 0-3
+ pmaddwd m12, m0, [r5 + 16] ; columns 4-7
+ paddd m6, m8 ; add the running accumulators for this row
+ paddd m12, m9
+ paddd m8, m3 ; step the accumulators so they are ready for the next row
+ paddd m9, m2
+ psrad m6, 6
+ psrad m12, 6
+ packssdw m6, m12 ; 8 dwords -> 8 words with signed saturation
+ movu [r0], m6 ; store columns 0-7
+
+ pmaddwd m6, m0, [r5 + 32]
+ pmaddwd m12, m0, [r5 + 48]
+ paddd m6, m10
+ paddd m12, m11
+ paddd m10, m16 ; step read from stack slot m16
+ paddd m11, m4
+ psrad m6, 6
+ psrad m12, 6
+ packssdw m6, m12
+ movu [r0 + 16], m6 ; store columns 8-15
+
+ pmaddwd m6, m0, [r5 + 64]
+ pmaddwd m12, m0, [r5 + 80]
+ paddd m6, m7
+ paddd m12, m13
+ paddd m7, m17 ; step read from stack slot m17
+ paddd m13, m5
+ psrad m6, 6
+ psrad m12, 6
+ packssdw m6, m12
+ movu [r0 + 32], m6 ; store columns 16-23
+
+ pmaddwd m6, m0, [r5 + 96]
+ pmaddwd m12, m0, [r5 + 112]
+ paddd m6, m14
+ paddd m12, m15
+ paddd m14, m19 ; step read from stack slot m19
+ paddd m15, m1
+ psrad m6, 6
+ psrad m12, 6
+ packssdw m6, m12
+ movu [r0 + 48], m6 ; store columns 24-31
+
+ lea r0, [r0 + r1] ; advance dst by one row (r1 is the byte stride)
+ add r2, 2 ; advance to the next per-row word
+%endmacro
+
+ mov r4, 8 ; 8 iterations x 4 macro expansions = 32 rows (r4 is free again: table1 is no longer needed)
+.loop:
+ INTRA_PRED_PLANAR32_sse2
+ INTRA_PRED_PLANAR32_sse2
+ INTRA_PRED_PLANAR32_sse2
+ INTRA_PRED_PLANAR32_sse2
+ dec r4
+ jnz .loop
+ mov rsp, r6 ; undo the manual stack reservation/alignment before the epilogue
+ RET
+
+%else
+;fallback code for every other configuration (not x86-64, or BIT_DEPTH != 12); written for BIT_DEPTH == 10
+;---------------------------------------------------------------------------------------
+; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
+;---------------------------------------------------------------------------------------
+INIT_XMM sse2
cglobal intra_pred_planar32, 3,3,16
movd m3, [r2 + 66] ; topRight = above[32]
@@ -1036,6 +1208,7 @@
%assign x x+1
%endrep
RET
+%endif
;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
More information about the x265-devel
mailing list