<div dir="ltr">This patch and the following patch "asm: interp_4tap_vert_X[16xN] avx2 10bit code for i420" have not been pushed yet.</div><div class="gmail_extra"><br><div class="gmail_quote">On Fri, Jun 5, 2015 at 6:59 PM, <span dir="ltr"><<a href="mailto:rajesh@multicorewareinc.com" target="_blank">rajesh@multicorewareinc.com</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>
# User Rajesh Paulraj<<a href="mailto:rajesh@multicorewareinc.com">rajesh@multicorewareinc.com</a>><br>
# Date 1433336549 -19800<br>
# Wed Jun 03 18:32:29 2015 +0530<br>
# Node ID 462553e87d22db5d6939ab565b5893cd96ba2eba<br>
# Parent 43afbde189f390c74f580b0d377731b498c7f7ce<br>
asm: align tab_ChromaCoeffV constant to 32 bytes bound<br>
and modify all chroma vertical filters code<br>
<br>
diff -r 43afbde189f3 -r 462553e87d22 source/common/x86/ipfilter16.asm<br>
--- a/source/common/x86/ipfilter16.asm Fri Jun 05 11:03:10 2015 +0530<br>
+++ b/source/common/x86/ipfilter16.asm Wed Jun 03 18:32:29 2015 +0530<br>
@@ -44,29 +44,29 @@<br>
dw -2, 16, 54, -4<br>
dw -2, 10, 58, -2<br>
<br>
-tab_ChromaCoeffV: times 4 dw 0, 64<br>
- times 4 dw 0, 0<br>
-<br>
- times 4 dw -2, 58<br>
- times 4 dw 10, -2<br>
-<br>
- times 4 dw -4, 54<br>
- times 4 dw 16, -2<br>
-<br>
- times 4 dw -6, 46<br>
- times 4 dw 28, -4<br>
-<br>
- times 4 dw -4, 36<br>
- times 4 dw 36, -4<br>
-<br>
- times 4 dw -4, 28<br>
- times 4 dw 46, -6<br>
-<br>
- times 4 dw -2, 16<br>
- times 4 dw 54, -4<br>
-<br>
- times 4 dw -2, 10<br>
- times 4 dw 58, -2<br>
+const tab_ChromaCoeffV, times 8 dw 0, 64<br>
+ times 8 dw 0, 0<br>
+<br>
+ times 8 dw -2, 58<br>
+ times 8 dw 10, -2<br>
+<br>
+ times 8 dw -4, 54<br>
+ times 8 dw 16, -2<br>
+<br>
+ times 8 dw -6, 46<br>
+ times 8 dw 28, -4<br>
+<br>
+ times 8 dw -4, 36<br>
+ times 8 dw 36, -4<br>
+<br>
+ times 8 dw -4, 28<br>
+ times 8 dw 46, -6<br>
+<br>
+ times 8 dw -2, 16<br>
+ times 8 dw 54, -4<br>
+<br>
+ times 8 dw -2, 10<br>
+ times 8 dw 58, -2<br>
<br>
tab_LumaCoeff: dw 0, 0, 0, 64, 0, 0, 0, 0<br>
dw -1, 4, -10, 58, 17, -5, 1, 0<br>
@@ -3292,34 +3292,34 @@<br>
movq m0, [r0]<br>
movq m1, [r0 + r1]<br>
punpcklwd m0, m1 ;m0=[0 1]<br>
- pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1<br>
+ pmaddwd m0, [r6 + 0 *32] ;m0=[0+1] Row1<br>
<br>
lea r0, [r0 + 2 * r1]<br>
movq m4, [r0]<br>
punpcklwd m1, m4 ;m1=[1 2]<br>
- pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2<br>
+ pmaddwd m1, [r6 + 0 *32] ;m1=[1+2] Row2<br>
<br>
movq m5, [r0 + r1]<br>
punpcklwd m4, m5 ;m4=[2 3]<br>
- pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3<br>
- pmaddwd m4, [r6 + 1 * 16]<br>
+ pmaddwd m2, m4, [r6 + 0 *32] ;m2=[2+3] Row3<br>
+ pmaddwd m4, [r6 + 1 * 32]<br>
paddd m0, m4 ;m0=[0+1+2+3] Row1 done<br>
<br>
lea r0, [r0 + 2 * r1]<br>
movq m4, [r0]<br>
punpcklwd m5, m4 ;m5=[3 4]<br>
- pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4<br>
- pmaddwd m5, [r6 + 1 * 16]<br>
+ pmaddwd m3, m5, [r6 + 0 *32] ;m3=[3+4] Row4<br>
+ pmaddwd m5, [r6 + 1 * 32]<br>
paddd m1, m5 ;m1 = [1+2+3+4] Row2<br>
<br>
movq m5, [r0 + r1]<br>
punpcklwd m4, m5 ;m4=[4 5]<br>
- pmaddwd m4, [r6 + 1 * 16]<br>
+ pmaddwd m4, [r6 + 1 * 32]<br>
paddd m2, m4 ;m2=[2+3+4+5] Row3<br>
<br>
movq m4, [r0 + 2 * r1]<br>
punpcklwd m5, m4 ;m5=[5 6]<br>
- pmaddwd m5, [r6 + 1 * 16]<br>
+ pmaddwd m5, [r6 + 1 * 32]<br>
paddd m3, m5 ;m3=[3+4+5+6] Row4<br>
%endmacro<br>
<br>
@@ -4022,7 +4022,7 @@<br>
add r1d, r1d<br>
add r3d, r3d<br>
sub r0, r1<br>
- shl r4d, 5<br>
+ shl r4d, 6<br>
<br>
%ifdef PIC<br>
lea r5, [tab_ChromaCoeffV]<br>
@@ -4243,7 +4243,7 @@<br>
movd m2, [r0]<br>
punpcklwd m1, m2 ;m1=[1 2]<br>
punpcklqdq m0, m1 ;m0=[0 1 1 2]<br>
- pmaddwd m0, [%1 + 0 *16] ;m0=[0+1 1+2] Row 1-2<br>
+ pmaddwd m0, [%1 + 0 *32] ;m0=[0+1 1+2] Row 1-2<br>
<br>
movd m1, [r0 + r1]<br>
punpcklwd m2, m1 ;m2=[2 3]<br>
@@ -4253,8 +4253,8 @@<br>
punpcklwd m1, m3 ;m2=[3 4]<br>
punpcklqdq m2, m1 ;m2=[2 3 3 4]<br>
<br>
- pmaddwd m4, m2, [%1 + 1 * 16] ;m4=[2+3 3+4] Row 1-2<br>
- pmaddwd m2, [%1 + 0 * 16] ;m2=[2+3 3+4] Row 3-4<br>
+ pmaddwd m4, m2, [%1 + 1 * 32] ;m4=[2+3 3+4] Row 1-2<br>
+ pmaddwd m2, [%1 + 0 * 32] ;m2=[2+3 3+4] Row 3-4<br>
paddd m0, m4 ;m0=[0+1+2+3 1+2+3+4] Row 1-2<br>
<br>
movd m1, [r0 + r1]<br>
@@ -4263,7 +4263,7 @@<br>
movd m4, [r0 + 2 * r1]<br>
punpcklwd m1, m4 ;m1=[5 6]<br>
punpcklqdq m3, m1 ;m2=[4 5 5 6]<br>
- pmaddwd m3, [%1 + 1 * 16] ;m3=[4+5 5+6] Row 3-4<br>
+ pmaddwd m3, [%1 + 1 * 32] ;m3=[4+5 5+6] Row 3-4<br>
paddd m2, m3 ;m2=[2+3+4+5 3+4+5+6] Row 3-4<br>
%endmacro<br>
<br>
@@ -4277,7 +4277,7 @@<br>
add r1d, r1d<br>
add r3d, r3d<br>
sub r0, r1<br>
- shl r4d, 5<br>
+ shl r4d, 6<br>
<br>
%ifdef PIC<br>
lea r5, [tab_ChromaCoeffV]<br>
@@ -4369,7 +4369,7 @@<br>
add r1d, r1d<br>
add r3d, r3d<br>
sub r0, r1<br>
- shl r4d, 5<br>
+ shl r4d, 6<br>
<br>
%ifdef PIC<br>
lea r5, [tab_ChromaCoeffV]<br>
@@ -4403,21 +4403,21 @@<br>
movh m0, [r0]<br>
movh m1, [r0 + r1]<br>
punpcklwd m0, m1 ;m0=[0 1]<br>
- pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1<br>
+ pmaddwd m0, [r5 + 0 *32] ;m0=[0+1] Row1<br>
<br>
lea r0, [r0 + 2 * r1]<br>
movh m2, [r0]<br>
punpcklwd m1, m2 ;m1=[1 2]<br>
- pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2<br>
+ pmaddwd m1, [r5 + 0 *32] ;m1=[1+2] Row2<br>
<br>
movh m3, [r0 + r1]<br>
punpcklwd m2, m3 ;m4=[2 3]<br>
- pmaddwd m2, [r5 + 1 * 16]<br>
+ pmaddwd m2, [r5 + 1 * 32]<br>
paddd m0, m2 ;m0=[0+1+2+3] Row1 done<br>
<br>
movh m2, [r0 + 2 * r1]<br>
punpcklwd m3, m2 ;m5=[3 4]<br>
- pmaddwd m3, [r5 + 1 * 16]<br>
+ pmaddwd m3, [r5 + 1 * 32]<br>
paddd m1, m3 ;m1=[1+2+3+4] Row2 done<br>
<br>
%ifidn %2, ss<br>
@@ -4476,7 +4476,7 @@<br>
add r1d, r1d<br>
add r3d, r3d<br>
sub r0, r1<br>
- shl r4d, 5<br>
+ shl r4d, 6<br>
<br>
%ifdef PIC<br>
lea r5, [tab_ChromaCoeffV]<br>
@@ -4610,31 +4610,31 @@<br>
movu m1, [r0]<br>
movu m3, [r0 + r1]<br>
punpcklwd m0, m1, m3<br>
- pmaddwd m0, [r5 + 0 * 16] ;m0 = [0l+1l] Row1l<br>
+ pmaddwd m0, [r5 + 0 * 32] ;m0 = [0l+1l] Row1l<br>
punpckhwd m1, m3<br>
- pmaddwd m1, [r5 + 0 * 16] ;m1 = [0h+1h] Row1h<br>
+ pmaddwd m1, [r5 + 0 * 32] ;m1 = [0h+1h] Row1h<br>
<br>
movu m4, [r0 + 2 * r1]<br>
punpcklwd m2, m3, m4<br>
- pmaddwd m2, [r5 + 0 * 16] ;m2 = [1l+2l] Row2l<br>
+ pmaddwd m2, [r5 + 0 * 32] ;m2 = [1l+2l] Row2l<br>
punpckhwd m3, m4<br>
- pmaddwd m3, [r5 + 0 * 16] ;m3 = [1h+2h] Row2h<br>
+ pmaddwd m3, [r5 + 0 * 32] ;m3 = [1h+2h] Row2h<br>
<br>
lea r0, [r0 + 2 * r1]<br>
movu m5, [r0 + r1]<br>
punpcklwd m6, m4, m5<br>
- pmaddwd m6, [r5 + 1 * 16] ;m6 = [2l+3l] Row1l<br>
+ pmaddwd m6, [r5 + 1 * 32] ;m6 = [2l+3l] Row1l<br>
paddd m0, m6 ;m0 = [0l+1l+2l+3l] Row1l sum<br>
punpckhwd m4, m5<br>
- pmaddwd m4, [r5 + 1 * 16] ;m6 = [2h+3h] Row1h<br>
+ pmaddwd m4, [r5 + 1 * 32] ;m6 = [2h+3h] Row1h<br>
paddd m1, m4 ;m1 = [0h+1h+2h+3h] Row1h sum<br>
<br>
movu m4, [r0 + 2 * r1]<br>
punpcklwd m6, m5, m4<br>
- pmaddwd m6, [r5 + 1 * 16] ;m6 = [3l+4l] Row2l<br>
+ pmaddwd m6, [r5 + 1 * 32] ;m6 = [3l+4l] Row2l<br>
paddd m2, m6 ;m2 = [1l+2l+3l+4l] Row2l sum<br>
punpckhwd m5, m4<br>
- pmaddwd m5, [r5 + 1 * 16] ;m1 = [3h+4h] Row2h<br>
+ pmaddwd m5, [r5 + 1 * 32] ;m1 = [3h+4h] Row2h<br>
paddd m3, m5 ;m3 = [1h+2h+3h+4h] Row2h sum<br>
%endmacro<br>
<br>
@@ -4648,7 +4648,7 @@<br>
add r1d, r1d<br>
add r3d, r3d<br>
sub r0, r1<br>
- shl r4d, 5<br>
+ shl r4d, 6<br>
<br>
%ifdef PIC<br>
lea r5, [tab_ChromaCoeffV]<br>
</blockquote></div><br></div>