[x265] [PATCH] asm: align tab_ChromaCoeffV constant to 32 bytes bound
rajesh at multicorewareinc.com
rajesh at multicorewareinc.com
Fri Jun 5 15:29:23 CEST 2015
# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1433336549 -19800
# Wed Jun 03 18:32:29 2015 +0530
# Node ID 462553e87d22db5d6939ab565b5893cd96ba2eba
# Parent 43afbde189f390c74f580b0d377731b498c7f7ce
asm: align tab_ChromaCoeffV constant to a 32-byte boundary
and update all chroma vertical filter code to match the new table layout
diff -r 43afbde189f3 -r 462553e87d22 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm Fri Jun 05 11:03:10 2015 +0530
+++ b/source/common/x86/ipfilter16.asm Wed Jun 03 18:32:29 2015 +0530
@@ -44,29 +44,29 @@
dw -2, 16, 54, -4
dw -2, 10, 58, -2
-tab_ChromaCoeffV: times 4 dw 0, 64
- times 4 dw 0, 0
-
- times 4 dw -2, 58
- times 4 dw 10, -2
-
- times 4 dw -4, 54
- times 4 dw 16, -2
-
- times 4 dw -6, 46
- times 4 dw 28, -4
-
- times 4 dw -4, 36
- times 4 dw 36, -4
-
- times 4 dw -4, 28
- times 4 dw 46, -6
-
- times 4 dw -2, 16
- times 4 dw 54, -4
-
- times 4 dw -2, 10
- times 4 dw 58, -2
+const tab_ChromaCoeffV, times 8 dw 0, 64
+ times 8 dw 0, 0
+
+ times 8 dw -2, 58
+ times 8 dw 10, -2
+
+ times 8 dw -4, 54
+ times 8 dw 16, -2
+
+ times 8 dw -6, 46
+ times 8 dw 28, -4
+
+ times 8 dw -4, 36
+ times 8 dw 36, -4
+
+ times 8 dw -4, 28
+ times 8 dw 46, -6
+
+ times 8 dw -2, 16
+ times 8 dw 54, -4
+
+ times 8 dw -2, 10
+ times 8 dw 58, -2
tab_LumaCoeff: dw 0, 0, 0, 64, 0, 0, 0, 0
dw -1, 4, -10, 58, 17, -5, 1, 0
@@ -3292,34 +3292,34 @@
movq m0, [r0]
movq m1, [r0 + r1]
punpcklwd m0, m1 ;m0=[0 1]
- pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1
+ pmaddwd m0, [r6 + 0 *32] ;m0=[0+1] Row1
lea r0, [r0 + 2 * r1]
movq m4, [r0]
punpcklwd m1, m4 ;m1=[1 2]
- pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2
+ pmaddwd m1, [r6 + 0 *32] ;m1=[1+2] Row2
movq m5, [r0 + r1]
punpcklwd m4, m5 ;m4=[2 3]
- pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3
- pmaddwd m4, [r6 + 1 * 16]
+ pmaddwd m2, m4, [r6 + 0 *32] ;m2=[2+3] Row3
+ pmaddwd m4, [r6 + 1 * 32]
paddd m0, m4 ;m0=[0+1+2+3] Row1 done
lea r0, [r0 + 2 * r1]
movq m4, [r0]
punpcklwd m5, m4 ;m5=[3 4]
- pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4
- pmaddwd m5, [r6 + 1 * 16]
+ pmaddwd m3, m5, [r6 + 0 *32] ;m3=[3+4] Row4
+ pmaddwd m5, [r6 + 1 * 32]
paddd m1, m5 ;m1 = [1+2+3+4] Row2
movq m5, [r0 + r1]
punpcklwd m4, m5 ;m4=[4 5]
- pmaddwd m4, [r6 + 1 * 16]
+ pmaddwd m4, [r6 + 1 * 32]
paddd m2, m4 ;m2=[2+3+4+5] Row3
movq m4, [r0 + 2 * r1]
punpcklwd m5, m4 ;m5=[5 6]
- pmaddwd m5, [r6 + 1 * 16]
+ pmaddwd m5, [r6 + 1 * 32]
paddd m3, m5 ;m3=[3+4+5+6] Row4
%endmacro
@@ -4022,7 +4022,7 @@
add r1d, r1d
add r3d, r3d
sub r0, r1
- shl r4d, 5
+ shl r4d, 6
%ifdef PIC
lea r5, [tab_ChromaCoeffV]
@@ -4243,7 +4243,7 @@
movd m2, [r0]
punpcklwd m1, m2 ;m1=[1 2]
punpcklqdq m0, m1 ;m0=[0 1 1 2]
- pmaddwd m0, [%1 + 0 *16] ;m0=[0+1 1+2] Row 1-2
+ pmaddwd m0, [%1 + 0 *32] ;m0=[0+1 1+2] Row 1-2
movd m1, [r0 + r1]
punpcklwd m2, m1 ;m2=[2 3]
@@ -4253,8 +4253,8 @@
punpcklwd m1, m3 ;m2=[3 4]
punpcklqdq m2, m1 ;m2=[2 3 3 4]
- pmaddwd m4, m2, [%1 + 1 * 16] ;m4=[2+3 3+4] Row 1-2
- pmaddwd m2, [%1 + 0 * 16] ;m2=[2+3 3+4] Row 3-4
+ pmaddwd m4, m2, [%1 + 1 * 32] ;m4=[2+3 3+4] Row 1-2
+ pmaddwd m2, [%1 + 0 * 32] ;m2=[2+3 3+4] Row 3-4
paddd m0, m4 ;m0=[0+1+2+3 1+2+3+4] Row 1-2
movd m1, [r0 + r1]
@@ -4263,7 +4263,7 @@
movd m4, [r0 + 2 * r1]
punpcklwd m1, m4 ;m1=[5 6]
punpcklqdq m3, m1 ;m2=[4 5 5 6]
- pmaddwd m3, [%1 + 1 * 16] ;m3=[4+5 5+6] Row 3-4
+ pmaddwd m3, [%1 + 1 * 32] ;m3=[4+5 5+6] Row 3-4
paddd m2, m3 ;m2=[2+3+4+5 3+4+5+6] Row 3-4
%endmacro
@@ -4277,7 +4277,7 @@
add r1d, r1d
add r3d, r3d
sub r0, r1
- shl r4d, 5
+ shl r4d, 6
%ifdef PIC
lea r5, [tab_ChromaCoeffV]
@@ -4369,7 +4369,7 @@
add r1d, r1d
add r3d, r3d
sub r0, r1
- shl r4d, 5
+ shl r4d, 6
%ifdef PIC
lea r5, [tab_ChromaCoeffV]
@@ -4403,21 +4403,21 @@
movh m0, [r0]
movh m1, [r0 + r1]
punpcklwd m0, m1 ;m0=[0 1]
- pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1
+ pmaddwd m0, [r5 + 0 *32] ;m0=[0+1] Row1
lea r0, [r0 + 2 * r1]
movh m2, [r0]
punpcklwd m1, m2 ;m1=[1 2]
- pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2
+ pmaddwd m1, [r5 + 0 *32] ;m1=[1+2] Row2
movh m3, [r0 + r1]
punpcklwd m2, m3 ;m4=[2 3]
- pmaddwd m2, [r5 + 1 * 16]
+ pmaddwd m2, [r5 + 1 * 32]
paddd m0, m2 ;m0=[0+1+2+3] Row1 done
movh m2, [r0 + 2 * r1]
punpcklwd m3, m2 ;m5=[3 4]
- pmaddwd m3, [r5 + 1 * 16]
+ pmaddwd m3, [r5 + 1 * 32]
paddd m1, m3 ;m1=[1+2+3+4] Row2 done
%ifidn %2, ss
@@ -4476,7 +4476,7 @@
add r1d, r1d
add r3d, r3d
sub r0, r1
- shl r4d, 5
+ shl r4d, 6
%ifdef PIC
lea r5, [tab_ChromaCoeffV]
@@ -4610,31 +4610,31 @@
movu m1, [r0]
movu m3, [r0 + r1]
punpcklwd m0, m1, m3
- pmaddwd m0, [r5 + 0 * 16] ;m0 = [0l+1l] Row1l
+ pmaddwd m0, [r5 + 0 * 32] ;m0 = [0l+1l] Row1l
punpckhwd m1, m3
- pmaddwd m1, [r5 + 0 * 16] ;m1 = [0h+1h] Row1h
+ pmaddwd m1, [r5 + 0 * 32] ;m1 = [0h+1h] Row1h
movu m4, [r0 + 2 * r1]
punpcklwd m2, m3, m4
- pmaddwd m2, [r5 + 0 * 16] ;m2 = [1l+2l] Row2l
+ pmaddwd m2, [r5 + 0 * 32] ;m2 = [1l+2l] Row2l
punpckhwd m3, m4
- pmaddwd m3, [r5 + 0 * 16] ;m3 = [1h+2h] Row2h
+ pmaddwd m3, [r5 + 0 * 32] ;m3 = [1h+2h] Row2h
lea r0, [r0 + 2 * r1]
movu m5, [r0 + r1]
punpcklwd m6, m4, m5
- pmaddwd m6, [r5 + 1 * 16] ;m6 = [2l+3l] Row1l
+ pmaddwd m6, [r5 + 1 * 32] ;m6 = [2l+3l] Row1l
paddd m0, m6 ;m0 = [0l+1l+2l+3l] Row1l sum
punpckhwd m4, m5
- pmaddwd m4, [r5 + 1 * 16] ;m6 = [2h+3h] Row1h
+ pmaddwd m4, [r5 + 1 * 32] ;m6 = [2h+3h] Row1h
paddd m1, m4 ;m1 = [0h+1h+2h+3h] Row1h sum
movu m4, [r0 + 2 * r1]
punpcklwd m6, m5, m4
- pmaddwd m6, [r5 + 1 * 16] ;m6 = [3l+4l] Row2l
+ pmaddwd m6, [r5 + 1 * 32] ;m6 = [3l+4l] Row2l
paddd m2, m6 ;m2 = [1l+2l+3l+4l] Row2l sum
punpckhwd m5, m4
- pmaddwd m5, [r5 + 1 * 16] ;m1 = [3h+4h] Row2h
+ pmaddwd m5, [r5 + 1 * 32] ;m1 = [3h+4h] Row2h
paddd m3, m5 ;m3 = [1h+2h+3h+4h] Row2h sum
%endmacro
@@ -4648,7 +4648,7 @@
add r1d, r1d
add r3d, r3d
sub r0, r1
- shl r4d, 5
+ shl r4d, 6
%ifdef PIC
lea r5, [tab_ChromaCoeffV]
More information about the x265-devel
mailing list