<div dir="ltr"><br><div class="gmail_extra"><br><br><div class="gmail_quote">On Thu, Nov 7, 2013 at 9:41 AM, <span dir="ltr">&lt;<a href="mailto:nabajit@multicorewareinc.com" target="_blank">nabajit@multicorewareinc.com</a>&gt;</span> wrote:<br>
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>
# User Nabajit Deka<br>
# Date 1383838838 -19800<br>
# Thu Nov 07 21:10:38 2013 +0530<br>
# Node ID a56c53581344df95e54f9cda919419f1d1ad0850<br>
# Parent 85002898f5b4308547af6ce464bbdff5f360fa13<br>
Bug fix for luma vpp asm routines. Also incorporated review comment changes.<br></blockquote><div><br></div><div>Great, now the luma vpp assembly functions are enabled for motion compensation in the encoder.</div><div><br>
</div><div>It would be really helpful if we could catch issues like this in our testbench. Perhaps we need to brainstorm a bit on how to do that.</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
diff -r 85002898f5b4 -r a56c53581344 source/common/x86/ipfilter8.asm<br>
--- a/source/common/x86/ipfilter8.asm Thu Nov 07 14:31:05 2013 +0530<br>
+++ b/source/common/x86/ipfilter8.asm Thu Nov 07 21:10:38 2013 +0530<br>
@@ -2188,17 +2188,17 @@<br>
movd m0, [r0 + 2 * r1]<br>
punpcklbw m1, m0 ; m1=[1 2]<br>
punpcklqdq m2, m1 ; m2=[0 1 1 2]<br>
- pmaddubsw m7, m2, [r6 + 0 * 16] ; m7 = [0+1 1+2]<br>
+ pmaddubsw m4, m2, [r6 + 0 * 16] ; m4=[0+1 1+2]<br>
<br>
lea r0, [r0 + 2 * r1]<br>
movd m1, [r0 + r1]<br>
- punpcklbw m6, m0, m1 ; m2=[2 3]<br>
+ punpcklbw m5, m0, m1 ; m5=[2 3]<br>
movd m0, [r0 + 2 * r1]<br>
punpcklbw m1, m0 ; m1=[3 4]<br>
- punpcklqdq m6, m1 ; m6=[2 3 3 4]<br>
- pmaddubsw m2, m6, [r6 + 1 * 16] ; m2 = [2+3 3+4]<br>
- paddw m7, m2 ; m7=[0+1+2+3 1+2+3+4] Row1-2<br>
- pmaddubsw m6, [r6 + 0 * 16] ; m6 = [2+3 3+4] Row3-4<br>
+ punpcklqdq m5, m1 ; m5=[2 3 3 4]<br>
+ pmaddubsw m2, m5, [r6 + 1 * 16] ; m2=[2+3 3+4]<br>
+ paddw m4, m2 ; m4=[0+1+2+3 1+2+3+4] Row1-2<br>
+ pmaddubsw m5, [r6 + 0 * 16] ; m5=[2+3 3+4] Row3-4<br>
<br>
lea r0, [r0 + 2 * r1]<br>
movd m1, [r0 + r1]<br>
@@ -2206,10 +2206,10 @@<br>
movd m0, [r0 + 2 * r1]<br>
punpcklbw m1, m0 ; m1=[5 6]<br>
punpcklqdq m2, m1 ; m2=[4 5 5 6]<br>
- pmaddubsw m1, m2, [r6 + 2 * 16] ; m1 = [4+5 5+6]<br>
- paddw m7, m1 ; m7=[0+1+2+3+4+5 1+2+3+4+5+6] Row1-2<br>
- pmaddubsw m2, [r6 + 1 * 16] ; m2 = [4+5 5+6]<br>
- paddw m6, m2 ; m6=[2+3+4+5 3+4+5+6] Row3-4<br>
+ pmaddubsw m1, m2, [r6 + 2 * 16] ; m1=[4+5 5+6]<br>
+ paddw m4, m1 ; m4=[0+1+2+3+4+5 1+2+3+4+5+6] Row1-2<br>
+ pmaddubsw m2, [r6 + 1 * 16] ; m2=[4+5 5+6]<br>
+ paddw m5, m2 ; m5=[2+3+4+5 3+4+5+6] Row3-4<br>
<br>
lea r0, [r0 + 2 * r1]<br>
movd m1, [r0 + r1]<br>
@@ -2217,10 +2217,10 @@<br>
movd m0, [r0 + 2 * r1]<br>
punpcklbw m1, m0 ; m1=[7 8]<br>
punpcklqdq m2, m1 ; m2=[6 7 7 8]<br>
- pmaddubsw m1, m2, [r6 + 3 * 16] ; m1 = [6+7 7+8]<br>
- paddw m7, m1 ; m7=[0+1+2+3+4+5+6+7 1+2+3+4+5+6+7+8] Row1-2 end<br>
+ pmaddubsw m1, m2, [r6 + 3 * 16] ; m1=[6+7 7+8]<br>
+ paddw m4, m1 ; m4=[0+1+2+3+4+5+6+7 1+2+3+4+5+6+7+8] Row1-2 end<br>
pmaddubsw m2, [r6 + 2 * 16] ; m2=[6+7 7+8]<br>
- paddw m6, m2 ; m6=[2+3+4+5+6+7 3+4+5+6+7+8] Row3-4<br>
+ paddw m5, m2 ; m5=[2+3+4+5+6+7 3+4+5+6+7+8] Row3-4<br>
<br>
lea r0, [r0 + 2 * r1]<br>
movd m1, [r0 + r1]<br>
@@ -2228,30 +2228,30 @@<br>
movd m0, [r0 + 2 * r1]<br>
punpcklbw m1, m0 ; m1=[9 10]<br>
punpcklqdq m2, m1 ; m2=[8 9 9 10]<br>
- pmaddubsw m2, [r6 + 3 * 16] ; m2 = [8+9 9+10]<br>
- paddw m6, m2 ; m6=[2+3+4+5+6+7+8+9 3+4+5+6+7+8+9+10] Row3-4 end<br>
+ pmaddubsw m2, [r6 + 3 * 16] ; m2=[8+9 9+10]<br>
+ paddw m5, m2 ; m5=[2+3+4+5+6+7+8+9 3+4+5+6+7+8+9+10] Row3-4 end<br>
%endmacro<br>
<br>
%macro PROCESS_LUMA_W8_4R 0<br>
movq m0, [r0]<br>
movq m1, [r0 + r1]<br>
punpcklbw m0, m1<br>
- pmaddubsw m7, m0, [r6 + 0 *16] ;m7 = [0+1] Row1<br>
+ pmaddubsw m7, m0, [r6 + 0 *16] ;m7=[0+1] Row1<br>
<br>
movq m0, [r0 + 2 * r1]<br>
punpcklbw m1, m0<br>
- pmaddubsw m6, m1, [r6 + 0 *16] ;m6 = [1+2] Row2<br>
+ pmaddubsw m6, m1, [r6 + 0 *16] ;m6=[1+2] Row2<br>
<br>
lea r0, [r0 + 2 * r1]<br>
movq m1, [r0 + r1]<br>
punpcklbw m0, m1<br>
- pmaddubsw m5, m0, [r6 + 0 *16] ;m5 = [2+3] Row3<br>
+ pmaddubsw m5, m0, [r6 + 0 *16] ;m5=[2+3] Row3<br>
pmaddubsw m0, [r6 + 1 * 16]<br>
- paddw m7, m0 ;m7 = [0+1+2+3] Row1<br>
+ paddw m7, m0 ;m7=[0+1+2+3] Row1<br>
<br>
movq m0, [r0 + 2 * r1]<br>
punpcklbw m1, m0<br>
- pmaddubsw m4, m1, [r6 + 0 *16] ;m4 = [3+4] Row4<br>
+ pmaddubsw m4, m1, [r6 + 0 *16] ;m4=[3+4] Row4<br>
pmaddubsw m1, [r6 + 1 * 16]<br>
paddw m6, m1 ;m6 = [1+2+3+4] Row2<br>
<br>
@@ -2260,41 +2260,41 @@<br>
punpcklbw m0, m1<br>
pmaddubsw m2, m0, [r6 + 1 * 16]<br>
pmaddubsw m0, [r6 + 2 * 16]<br>
- paddw m7, m0 ;m7 = [0+1+2+3+4+5] Row1<br>
- paddw m5, m2 ;m5 = [2+3+4+5] Row3<br>
+ paddw m7, m0 ;m7=[0+1+2+3+4+5] Row1<br>
+ paddw m5, m2 ;m5=[2+3+4+5] Row3<br>
<br>
movq m0, [r0 + 2 * r1]<br>
punpcklbw m1, m0<br>
pmaddubsw m2, m1, [r6 + 1 * 16]<br>
pmaddubsw m1, [r6 + 2 * 16]<br>
- paddw m6, m1 ;m6 = [1+2+3+4+5+6] Row2<br>
- paddw m4, m2 ;m4 = [3+4+5+6] Row4<br>
+ paddw m6, m1 ;m6=[1+2+3+4+5+6] Row2<br>
+ paddw m4, m2 ;m4=[3+4+5+6] Row4<br>
<br>
lea r0, [r0 + 2 * r1]<br>
movq m1, [r0 + r1]<br>
punpcklbw m0, m1<br>
pmaddubsw m2, m0, [r6 + 2 * 16]<br>
pmaddubsw m0, [r6 + 3 * 16]<br>
- paddw m7, m0 ;m7 = [0+1+2+3+4+5+6+7] Row1 end<br>
- paddw m5, m2 ;m5 = [2+3+4+5+6+7] Row3<br>
+ paddw m7, m0 ;m7=[0+1+2+3+4+5+6+7] Row1 end<br>
+ paddw m5, m2 ;m5=[2+3+4+5+6+7] Row3<br>
<br>
movq m0, [r0 + 2 * r1]<br>
punpcklbw m1, m0<br>
pmaddubsw m2, m1, [r6 + 2 * 16]<br>
pmaddubsw m1, [r6 + 3 * 16]<br>
- paddw m6, m1 ;m6 = [1+2+3+4+5+6+7+8] Row2 end<br>
- paddw m4, m2 ;m4 = [3+4+5+6+7+8] Row4<br>
+ paddw m6, m1 ;m6=[1+2+3+4+5+6+7+8] Row2 end<br>
+ paddw m4, m2 ;m4=[3+4+5+6+7+8] Row4<br>
<br>
lea r0, [r0 + 2 * r1]<br>
movq m1, [r0 + r1]<br>
punpcklbw m0, m1<br>
pmaddubsw m0, [r6 + 3 * 16]<br>
- paddw m5, m0 ;m5 = [2+3+4+5+6+7+8+9] Row3 end<br>
+ paddw m5, m0 ;m5=[2+3+4+5+6+7+8+9] Row3 end<br>
<br>
movq m0, [r0 + 2 * r1]<br>
punpcklbw m1, m0<br>
pmaddubsw m1, [r6 + 3 * 16]<br>
- paddw m4, m1 ;m4 = [3+4+5+6+7+8+9+10] Row4 end<br>
+ paddw m4, m1 ;m4=[3+4+5+6+7+8+9+10] Row4 end<br>
%endmacro<br>
<br>
;-------------------------------------------------------------------------------------------------------------<br>
@@ -2306,7 +2306,7 @@<br>
lea r5, [r1 + 2 * r1]<br>
sub r0, r5<br>
shl r4d, 6<br>
-%ifidn %3, ps<br>
+%ifidn %3,ps<br>
add r3d, r3d<br>
%endif<br>
<br>
@@ -2317,7 +2317,7 @@<br>
lea r6, [tab_LumaCoeffVer + r4]<br>
%endif<br>
<br>
-%ifidn %3, pp<br>
+%ifidn %3,pp<br>
mova m3, [tab_c_512]<br>
%else<br>
mova m3, [tab_c_8192]<br>
@@ -2328,29 +2328,29 @@<br>
.loopH<br>
PROCESS_LUMA_W4_4R<br>
<br>
-%ifidn %3, pp<br>
- pmulhrsw m7, m3<br>
- pmulhrsw m6, m3<br>
-<br>
- packuswb m7, m7<br>
- packuswb m6, m6<br>
-<br>
- movd [r2], m7<br>
- pshufd m7, m7, 1<br>
- movd [r2 + r3], m7<br>
- movd [r2 + 2 * r3], m6<br>
- pshufd m6, m6, 1<br>
+%ifidn %3,pp<br>
+ pmulhrsw m4, m3<br>
+ pmulhrsw m5, m3<br>
+<br>
+ packuswb m4, m4<br>
+ packuswb m5, m5<br>
+<br>
+ movd [r2], m4<br>
+ pshufd m4, m4, 1<br>
+ movd [r2 + r3], m4<br>
+ movd [r2 + 2 * r3], m5<br>
+ pshufd m5, m5, 1<br>
lea r5, [r3 + 2 * r3]<br>
- movd [r2 + r5], m6<br>
+ movd [r2 + r5], m5<br>
%else<br>
- psubw m7, m3<br>
- psubw m6, m3<br>
-<br>
- movlps [r2], m7<br>
- movhps [r2 + r3], m7<br>
- movlps [r2 + 2 * r3], m6<br>
+ psubw m4, m3<br>
+ psubw m5, m3<br>
+<br>
+ movlps [r2], m4<br>
+ movhps [r2 + r3], m4<br>
+ movlps [r2 + 2 * r3], m5<br>
lea r5, [r3 + 2 * r3]<br>
- movhps [r2 + r5], m6<br>
+ movhps [r2 + r5], m5<br>
%endif<br>
<br>
lea r5, [4 * r1]<br>
@@ -2403,7 +2403,7 @@<br>
sub r0, r5<br>
shl r4d, 6<br>
<br>
-%ifidn %3, ps<br>
+%ifidn %3,ps<br>
add r3d, r3d<br>
%endif<br>
<br>
@@ -2414,7 +2414,7 @@<br>
lea r6, [tab_LumaCoeffVer + r4]<br>
%endif<br>
<br>
- %ifidn %3, pp<br>
+ %ifidn %3,pp<br>
mova m3, [tab_c_512]<br>
%else<br>
mova m3, [tab_c_8192]<br>
@@ -2425,7 +2425,7 @@<br>
.loopH<br>
PROCESS_LUMA_W8_4R<br>
<br>
-%ifidn %3, pp<br>
+%ifidn %3,pp<br>
pmulhrsw m7, m3<br>
pmulhrsw m6, m3<br>
pmulhrsw m5, m3<br>
@@ -2440,16 +2440,16 @@<br>
lea r5, [r3 + 2 * r3]<br>
movhps [r2 + r5], m5<br>
%else<br>
- psubw m7, m3<br>
- psubw m6, m3<br>
- psubw m5, m3<br>
- psubw m4, m3<br>
-<br>
- movu [r2], m7<br>
- movu [r2 + r3], m6<br>
- movu [r2 + 2 * r3], m5<br>
- lea r5, [r3 + 2 * r3]<br>
- movu [r2 + r5], m4<br>
+ psubw m7, m3<br>
+ psubw m6, m3<br>
+ psubw m5, m3<br>
+ psubw m4, m3<br>
+<br>
+ movu [r2], m7<br>
+ movu [r2 + r3], m6<br>
+ movu [r2 + 2 * r3], m5<br>
+ lea r5, [r3 + 2 * r3]<br>
+ movu [r2 + r5], m4<br>
%endif<br>
<br>
lea r5, [4 * r1]<br>
@@ -2511,7 +2511,7 @@<br>
lea r5, [r1 + 2 * r1]<br>
sub r0, r5<br>
shl r4d, 6<br>
-%ifidn %3, ps<br>
+%ifidn %3,ps<br>
add r3d, r3d<br>
%endif<br>
<br>
@@ -2522,7 +2522,7 @@<br>
lea r6, [tab_LumaCoeffVer + r4]<br>
%endif<br>
<br>
- %ifidn %3, pp<br>
+ %ifidn %3,pp<br>
mova m3, [tab_c_512]<br>
%else<br>
mova m3, [tab_c_8192]<br>
@@ -2533,7 +2533,7 @@<br>
.loopH<br>
PROCESS_LUMA_W8_4R<br>
<br>
-%ifidn %3, pp<br>
+%ifidn %3,pp<br>
pmulhrsw m7, m3<br>
pmulhrsw m6, m3<br>
pmulhrsw m5, m3<br>
@@ -2548,21 +2548,21 @@<br>
lea r5, [r3 + 2 * r3]<br>
movhps [r2 + r5], m5<br>
%else<br>
- psubw m7, m3<br>
- psubw m6, m3<br>
- psubw m5, m3<br>
- psubw m4, m3<br>
-<br>
- movu [r2], m7<br>
- movu [r2 + r3], m6<br>
- movu [r2 + 2 * r3], m5<br>
- lea r5, [r3 + 2 * r3]<br>
- movu [r2 + r5], m4<br>
+ psubw m7, m3<br>
+ psubw m6, m3<br>
+ psubw m5, m3<br>
+ psubw m4, m3<br>
+<br>
+ movu [r2], m7<br>
+ movu [r2 + r3], m6<br>
+ movu [r2 + 2 * r3], m5<br>
+ lea r5, [r3 + 2 * r3]<br>
+ movu [r2 + r5], m4<br>
%endif<br>
<br>
lea r5, [8 * r1 - 8]<br>
sub r0, r5<br>
-%ifidn %3, pp<br>
+%ifidn %3,pp<br>
add r2, 8<br>
%else<br>
add r2, 16<br>
@@ -2570,34 +2570,34 @@<br>
<br>
PROCESS_LUMA_W4_4R<br>
<br>
-%ifidn %3, pp<br>
- pmulhrsw m7, m3<br>
- pmulhrsw m6, m3<br>
-<br>
- packuswb m7, m7<br>
- packuswb m6, m6<br>
-<br>
- movd [r2], m7<br>
- pshufd m7, m7, 1<br>
- movd [r2 + r3], m7<br>
- movd [r2 + 2 * r3], m6<br>
- pshufd m6, m6, 1<br>
+%ifidn %3,pp<br>
+ pmulhrsw m4, m3<br>
+ pmulhrsw m5, m3<br>
+<br>
+ packuswb m4, m4<br>
+ packuswb m5, m5<br>
+<br>
+ movd [r2], m4<br>
+ pshufd m4, m4, 1<br>
+ movd [r2 + r3], m4<br>
+ movd [r2 + 2 * r3], m5<br>
+ pshufd m5, m5, 1<br>
lea r5, [r3 + 2 * r3]<br>
- movd [r2 + r5], m6<br>
+ movd [r2 + r5], m5<br>
%else<br>
- psubw m7, m3<br>
- psubw m6, m3<br>
-<br>
- movlps [r2], m7<br>
- movhps [r2 + r3], m7<br>
- movlps [r2 + 2 * r3], m6<br>
+ psubw m4, m3<br>
+ psubw m5, m3<br>
+<br>
+ movlps [r2], m4<br>
+ movhps [r2 + r3], m4<br>
+ movlps [r2 + 2 * r3], m5<br>
lea r5, [r3 + 2 * r3]<br>
- movhps [r2 + r5], m6<br>
+ movhps [r2 + r5], m5<br>
%endif<br>
<br>
lea r5, [4 * r1 + 8]<br>
sub r0, r5<br>
-%ifidn %3, pp<br>
+%ifidn %3,pp<br>
lea r2, [r2 + 4 * r3 - 8]<br>
%else<br>
lea r2, [r2 + 4 * r3 - 16]<br>
@@ -2628,7 +2628,7 @@<br>
lea r5, [r1 + 2 * r1]<br>
sub r0, r5<br>
shl r4d, 6<br>
-%ifidn %3, ps<br>
+%ifidn %3,ps<br>
add r3d, r3d<br>
%endif<br>
<br>
@@ -2639,7 +2639,7 @@<br>
lea r6, [tab_LumaCoeffVer + r4]<br>
%endif<br>
<br>
-%ifidn %3, pp<br>
+%ifidn %3,pp<br>
mova m3, [tab_c_512]<br>
%else<br>
mova m3, [tab_c_8192]<br>
@@ -2650,7 +2650,7 @@<br>
mov r4d, (%1/8)<br>
.loopW<br>
PROCESS_LUMA_W8_4R<br>
-%ifidn %3, pp<br>
+%ifidn %3,pp<br>
pmulhrsw m7, m3<br>
pmulhrsw m6, m3<br>
pmulhrsw m5, m3<br>
@@ -2665,30 +2665,30 @@<br>
lea r5, [r3 + 2 * r3]<br>
movhps [r2 + r5], m5<br>
%else<br>
- psubw m7, m3<br>
- psubw m6, m3<br>
- psubw m5, m3<br>
- psubw m4, m3<br>
-<br>
- movu [r2], m7<br>
- movu [r2 + r3], m6<br>
- movu [r2 + 2 * r3], m5<br>
- lea r5, [r3 + 2 * r3]<br>
- movu [r2 + r5], m4<br>
+ psubw m7, m3<br>
+ psubw m6, m3<br>
+ psubw m5, m3<br>
+ psubw m4, m3<br>
+<br>
+ movu [r2], m7<br>
+ movu [r2 + r3], m6<br>
+ movu [r2 + 2 * r3], m5<br>
+ lea r5, [r3 + 2 * r3]<br>
+ movu [r2 + r5], m4<br>
%endif<br>
<br>
lea r5, [8 * r1 - 8]<br>
sub r0, r5<br>
-%ifidn %3, pp<br>
+%ifidn %3,pp<br>
add r2, 8<br>
%else<br>
add r2, 16<br>
%endif<br>
dec r4d<br>
- jnz .loopW<br>
+ jnz .loopW<br>
<br>
lea r0, [r0 + 4 * r1 - %1]<br>
-%ifidn %3, pp<br>
+%ifidn %3,pp<br>
lea r2, [r2 + 4 * r3 - %1]<br>
%else<br>
lea r2, [r2 + 4 * r3 - 2 * %1]<br>
_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
</blockquote></div><br><br clear="all"><div><br></div>-- <br>Steve Borho
</div></div>