<div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><DIV>>@@ -1154,14 +1154,18 @@<BR>> ; void dequant_normal(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift)<BR>> ;-----------------------------------------------------------------------------<BR>> INIT_XMM sse4<BR>>-cglobal dequant_normal, 2,5,8<BR>>- movd m1, r3m ; m1 = word [scale]<BR>>+cglobal dequant_normal, 4,6,5<BR>>+ movd m1, r3 ; m1 = word [scale]<BR>>+ cmp r3d, 255<BR>>+ jle .skip<BR>>+ psrld m1, 2<BR>>+.skip:<BR>> mov r4d, r4m<BR>> movd m0, r4d ; m0 = shift<BR>>- xor r3d, r3d<BR>>+ xor r5d, r5d<BR>> dec r4d<BR>>- bts r3d, r4d<BR>>- movd m2, r3d<BR>>+ bts r5d, r4d<BR>>+ movd m2, r5d<BR>> punpcklwd m1, m2<BR>> pshufd m1, m1, 0 ; m1 = dword [add scale]<BR>> mova m2, [pw_1]<BR>>@@ -1174,6 +1178,10 @@<BR>> movu m3, [r0]<BR>> movu m4, [r0 + 16]<BR>> packssdw m3, m4 ; m3 = clipQCoef<BR>>+ cmp r3d, 255<BR>>+ jle .skip1<BR>>+ psllw m3, 2<BR>>+.skip1:<BR>> punpckhwd m4, m3, m2<BR>> punpcklwd m3, m2<BR>> pmaddwd m3, m1 ; m3 = dword (clipQCoef * scale + add)<BR></DIV>
<DIV> </DIV>
<DIV>Don't put a conditional jump inside the inner loop; it hurts performance.</DIV>
<DIV>Instead, split the code into two separate loop blocks and branch once at the start of the function to select the right one.</DIV>
<DIV> </DIV></div>