<div dir="ltr"><div class="gmail_default" style="font-family:trebuchet ms,sans-serif;font-size:small">please ignore this patch<br></div></div><div class="gmail_extra"><br clear="all"><div><div dir="ltr"><div><div><span style="color:rgb(68,68,68)"><span style="background-color:rgb(255,255,255)"><span style="color:rgb(0,0,0)"><font><span style="font-family:trebuchet ms,sans-serif">---</span></font></span><i><font><span style="font-family:trebuchet ms,sans-serif"><br>
</span></font></i></span></span></div><span style="color:rgb(68,68,68)"><span style="background-color:rgb(255,255,255)"><span style="color:rgb(0,0,0)"><span style="font-family:trebuchet ms,sans-serif"><font>Dnyaneshwar G</font></span></span><font><span style="font-family:trebuchet ms,sans-serif"><span style="color:rgb(0,0,0)"><span></span></span><br>
</span></font></span></span></div></div></div>
<br><br><div class="gmail_quote">On Tue, Dec 10, 2013 at 6:34 PM, <span dir="ltr">&lt;<a href="mailto:dnyaneshwar@multicorewareinc.com" target="_blank">dnyaneshwar@multicorewareinc.com</a>&gt;</span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
# HG changeset patch<br>
# User Dnyaneshwar G &lt;<a href="mailto:dnyaneshwar@multicorewareinc.com">dnyaneshwar@multicorewareinc.com</a>&gt;<br>
# Date 1386680027 -19800<br>
# Tue Dec 10 18:23:47 2013 +0530<br>
# Node ID 981a0e6d10fb3df403329664d5e4efdee0578a9c<br>
# Parent 7af37d60e4437602cde5ab17357812733741ac1d<br>
assembly code for intra_pred_planar_16x16 for 10 and 12-bit<br>
<br>
diff -r 7af37d60e443 -r 981a0e6d10fb source/common/x86/asm-primitives.cpp<br>
--- a/source/common/x86/asm-primitives.cpp Tue Dec 10 17:10:30 2013 +0530<br>
+++ b/source/common/x86/asm-primitives.cpp Tue Dec 10 18:23:47 2013 +0530<br>
@@ -721,6 +721,7 @@<br>
{<br>
p.intra_pred[BLOCK_4x4][0] = x265_intra_pred_planar4_sse4;<br>
p.intra_pred[BLOCK_8x8][0] = x265_intra_pred_planar8_sse4;<br>
+ p.intra_pred[BLOCK_16x16][0] = x265_intra_pred_planar16_sse4;<br>
<br>
p.intra_pred[BLOCK_4x4][1] = x265_intra_pred_dc4_sse4;<br>
p.intra_pred[BLOCK_8x8][1] = x265_intra_pred_dc8_sse4;<br>
diff -r 7af37d60e443 -r 981a0e6d10fb source/common/x86/intrapred16.asm<br>
--- a/source/common/x86/intrapred16.asm Tue Dec 10 17:10:30 2013 +0530<br>
+++ b/source/common/x86/intrapred16.asm Tue Dec 10 18:23:47 2013 +0530<br>
@@ -3,6 +3,7 @@<br>
;*<br>
;* Authors: Dnyaneshwar Gorade &lt;<a href="mailto:dnyaneshwar@multicorewareinc.com">dnyaneshwar@multicorewareinc.com</a>&gt;<br>
;* Yuvaraj Venkatesh &lt;<a href="mailto:yuvaraj@multicorewareinc.com">yuvaraj@multicorewareinc.com</a>&gt;<br>
+;* Min Chen &lt;<a href="mailto:chenm003@163.com">chenm003@163.com</a>&gt; &lt;<a href="mailto:min.chen@multicorewareinc.com">min.chen@multicorewareinc.com</a>&gt;<br>
;*<br>
;* This program is free software; you can redistribute it and/or modify<br>
;* it under the terms of the GNU General Public License as published by<br>
@@ -34,6 +35,7 @@<br>
%assign x x+1<br>
%endrep<br>
<br>
+const pw_unpack0wd, times 4 db 0,1,8,8<br>
<br>
SECTION .text<br>
<br>
@@ -43,6 +45,7 @@<br>
cextern pd_32<br>
cextern pw_4096<br>
cextern multiL<br>
+cextern multiH<br>
cextern multi_2Row<br>
<br>
<br>
@@ -542,6 +545,188 @@<br>
RET<br>
<br>
<br>
+;-----------------------------------------------------------------------------------------------------------<br>
+; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)<br>
+;-----------------------------------------------------------------------------------------------------------<br>
+INIT_XMM sse4<br>
+%if (BIT_DEPTH == 12) ; 12-bit build: 16x16 planar prediction computed in 32-bit (dword) lanes<br>
+<br>
+%if (ARCH_X86_64 == 1)<br>
+cglobal intra_pred_planar16, 4,7,8+3<br>
+ %define bottomRow0 m7<br>
+ %define bottomRow1 m8<br>
+ %define bottomRow2 m9<br>
+ %define bottomRow3 m10<br>
+%else ; 32-bit build: only m0-m7 are used so three of the four bottomRow vectors live on the stack<br>
+cglobal intra_pred_planar16, 4,7,8, 0-3*mmsize<br>
+ %define bottomRow0 [rsp + 0*mmsize]<br>
+ %define bottomRow1 [rsp + 1*mmsize]<br>
+ %define bottomRow2 [rsp + 2*mmsize]<br>
+ %define bottomRow3 m7<br>
+%endif<br>
+<br>
+ add r2, 2 ; skip the first 16-bit entry of the left array (r2 = left per the prototype comment)<br>
+ add r3, 2 ; skip the first 16-bit entry of the above array (r3 = above)<br>
+ add r1, r1 ; double dstStride - presumably pixels to bytes since samples are stored as words; confirm<br>
+<br>
+ pxor m0, m0 ; m0 = 0 and is used by punpckhwd below to zero-extend words to dwords<br>
+<br>
+ ; bottomRow<br>
+ movzx r4d, word [r2 + 16*2] ; r4d = left[16]<br>
+ movd m1, r4d<br>
+ pshufd m1, m1, 0 ; m1 = bottomLeft<br>
+ movu m2, [r3] ; m2 = above[0..7] as words<br>
+ pmovzxwd m3, m2 ; m3 = above[0..3] as dwords<br>
+ punpckhwd m2, m0 ; m2 = above[4..7] as dwords<br>
+ psubd m4, m1, m3 ; bottomLeft - above[0..3]<br>
+ mova bottomRow0, m4<br>
+ psubd m4, m1, m2 ; bottomLeft - above[4..7]<br>
+ mova bottomRow1, m4<br>
+ movu m2, [r3 + 16] ; m2 = above[8..15] as words<br>
+ pmovzxwd m3, m2 ; m3 = above[8..11] as dwords<br>
+ punpckhwd m2, m0 ; m2 = above[12..15] as dwords<br>
+ psubd m4, m1, m3 ; bottomLeft - above[8..11]<br>
+ mova bottomRow2, m4<br>
+ psubd m1, m2 ; bottomLeft - above[12..15] (m1 is no longer needed as bottomLeft)<br>
+ mova bottomRow3, m1<br>
+<br>
+ ; topRow<br>
+ pmovzxwd m0, [r3 + 0*8] ; m0 = above[0..3] as dwords<br>
+ pslld m0, 4 ; times 16<br>
+ pmovzxwd m1, [r3 + 1*8] ; m1 = above[4..7] as dwords<br>
+ pslld m1, 4 ; times 16<br>
+ pmovzxwd m2, [r3 + 2*8] ; m2 = above[8..11] as dwords<br>
+ pslld m2, 4 ; times 16<br>
+ pmovzxwd m3, [r3 + 3*8] ; m3 = above[12..15] as dwords<br>
+ pslld m3, 4 ; times 16<br>
+<br>
+ xor r6, r6 ; r6 = row counter starting at 0<br>
+.loopH:<br>
+ movzx r4d, word [r2 + r6*2] ; r4d = left[row]<br>
+ movzx r5d, word [r3 + 16*2] ; r5 = topRight (same address every iteration; reloaded because the sub below overwrites r5d)<br>
+ sub r5d, r4d ; r5d = topRight - left[row]<br>
+ movd m5, r5d<br>
+ pshuflw m5, m5, 0 ; copy the low word into the low four words<br>
+ pmullw m5, [multiL] ; multiply by the multiL word table (cextern - presumably 1 2 3 4 ... ; confirm)<br>
+ pmovsxwd m5, m5 ; m5 = rightCol<br>
+ add r4d, r4d ; r4d = 2 * left[row]<br>
+ lea r4d, [r4d * 8 + 16] ; r4d = 16 * left[row] + 16<br>
+ movd m4, r4d<br>
+ pshufd m4, m4, 0 ; m4 = horPred<br>
+ paddd m4, m5 ; m4 = horPred + rightCol for columns 0-3<br>
+ pshufd m6, m5, 0xFF ; m6 = dword lane 3 of rightCol broadcast to all lanes; added below as the step between 4-column groups<br>
+<br>
+ ; 0-3<br>
+ paddd m0, bottomRow0 ; running vertical term: one more bottomRow0 is added on every row<br>
+ paddd m5, m0, m4 ; vertical term + horizontal term<br>
+ psrad m5, 5 ; arithmetic shift right by 5<br>
+ packusdw m5, m5 ; dwords to words with unsigned saturation<br>
+ movh [r0 + 0*8], m5 ; store 4 output words<br>
+<br>
+ ; 4-7<br>
+ paddd m4, m6 ; advance the horizontal term to columns 4-7<br>
+ paddd m1, bottomRow1 ; running vertical term for columns 4-7<br>
+ paddd m5, m1, m4<br>
+ psrad m5, 5<br>
+ packusdw m5, m5<br>
+ movh [r0 + 1*8], m5<br>
+<br>
+ ; 8-11<br>
+ paddd m4, m6 ; advance the horizontal term to columns 8-11<br>
+ paddd m2, bottomRow2 ; running vertical term for columns 8-11<br>
+ paddd m5, m2, m4<br>
+ psrad m5, 5<br>
+ packusdw m5, m5<br>
+ movh [r0 + 2*8], m5<br>
+<br>
+ ; 12-15<br>
+ paddd m4, m6 ; advance the horizontal term to columns 12-15<br>
+ paddd m3, bottomRow3 ; running vertical term for columns 12-15<br>
+ paddd m5, m3, m4<br>
+ psrad m5, 5<br>
+ packusdw m5, m5<br>
+ movh [r0 + 3*8], m5<br>
+<br>
+ add r0, r1 ; step dst to the next output row<br>
+ inc r6d<br>
+ cmp r6d, 16 ; 16 rows in total<br>
+ jnz .loopH<br>
+<br>
+ RET<br>
+%else ; BIT-DEPTH == 10<br>
+INIT_XMM sse4<br>
+cglobal intra_pred_planar16, 4,6,7<br>
+ add r2, 2 ; skip the first 16-bit entry of the left array (r2 = left per the prototype comment)<br>
+ add r3, 2 ; skip the first 16-bit entry of the above array (r3 = above)<br>
+ add r1, r1 ; double dstStride - presumably pixels to bytes since samples are stored as words; confirm<br>
+<br>
+ movu m1, [r3] ; topRow[0-7]<br>
+ movu m2, [r3 + 16] ; topRow[8-15]<br>
+<br>
+ movd m3, [r2 + 32] ; loads 4 bytes at left[16]; only the low word survives the broadcast below. NOTE(review): this also reads the word after left[16] - confirm it is readable<br>
+ pshuflw m3, m3, 0 ; copy the low word into the low four words<br>
+ pshufd m3, m3, 0 ; m3 = left[16] in all 8 word lanes<br>
+ movzx r4d, word [r3 + 32] ; topRight = above[16]<br>
+<br>
+ psubw m4, m3, m1 ; v_bottomRow[0]<br>
+ psubw m3, m2 ; v_bottomRow[1]<br>
+<br>
+ psllw m1, 4 ; topRow[0-7] times 16<br>
+ psllw m2, 4 ; topRow[8-15] times 16<br>
+<br>
+%macro PRED_PLANAR_ROW16 1<br>
+ movzx r5d, word [r2 + %1 * 2] ; r5d = left[row] where row is the macro argument<br>
+ add r5d, r5d ; r5d = 2 * left[row]<br>
+ lea r5d, [r5d * 8 + 16] ; r5d = 16 * left[row] + 16<br>
+ movd m5, r5d<br>
+ pshuflw m5, m5, 0<br>
+ pshufd m5, m5, 0 ; horPred<br>
+<br>
+ movzx r5d, word [r2 + %1 * 2] ; reload left[row] because r5d was overwritten above<br>
+ mov r3d, r4d ; r3 is reused as scratch: the above row is already held in m1 and m2<br>
+ sub r3d, r5d ; r3d = topRight - left[row]<br>
+ movd m0, r3d<br>
+ pshuflw m0, m0, 0<br>
+ pshufd m0, m0, 0 ; m0 = (topRight - left[row]) in all 8 word lanes<br>
+<br>
+ pmullw m6, m0, [multiL] ; scale by the multiL word table for columns 0-7 (cextern - contents not visible here)<br>
+ paddw m6, m5 ; add horPred<br>
+ paddw m1, m4 ; running vertical term for columns 0-7; m1 carries over to the next macro invocation<br>
+ paddw m6, m1<br>
+ psraw m6, 5 ; arithmetic shift right by 5<br>
+<br>
+ pmullw m0, m0, [multiH] ; scale by the multiH word table for columns 8-15 (cextern - contents not visible here)<br>
+ paddw m5, m0 ; horPred plus the scaled difference<br>
+ paddw m2, m3 ; running vertical term for columns 8-15; m2 carries over to the next macro invocation<br>
+ paddw m5, m2<br>
+ psraw m5, 5 ; arithmetic shift right by 5<br>
+<br>
+ movu [r0], m6 ; store columns 0-7<br>
+ movu [r0 + 16], m5 ; store columns 8-15<br>
+ add r0, r1 ; step dst to the next output row<br>
+%endmacro<br>
+<br>
+ PRED_PLANAR_ROW16 0<br>
+ PRED_PLANAR_ROW16 1<br>
+ PRED_PLANAR_ROW16 2<br>
+ PRED_PLANAR_ROW16 3<br>
+ PRED_PLANAR_ROW16 4<br>
+ PRED_PLANAR_ROW16 5<br>
+ PRED_PLANAR_ROW16 6<br>
+ PRED_PLANAR_ROW16 7<br>
+ PRED_PLANAR_ROW16 8<br>
+ PRED_PLANAR_ROW16 9<br>
+ PRED_PLANAR_ROW16 10<br>
+ PRED_PLANAR_ROW16 11<br>
+ PRED_PLANAR_ROW16 12<br>
+ PRED_PLANAR_ROW16 13<br>
+ PRED_PLANAR_ROW16 14<br>
+ PRED_PLANAR_ROW16 15<br>
+%undef PRED_PLANAR_ROW16 ; NOTE(review): %undef is the single-line-macro directive and this name was created with %macro - it likely needs %unmacro to take effect; confirm assembler support<br>
+<br>
+ RET<br>
+%endif<br>
+<br>
;-----------------------------------------------------------------------------<br>
; void intraPredAng(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)<br>
;-----------------------------------------------------------------------------<br>
</blockquote></div><br></div>