[x265] [PATCH] asm: align tab_ChromaCoeffV constant to 32 bytes bound
Rajesh Paulraj
rajesh at multicorewareinc.com
Tue Jun 9 07:18:53 CEST 2015
This patch and the following patch "asm: interp_4tap_vert_X[16xN] avx2
10bit code for i420" have not been pushed yet.
On Fri, Jun 5, 2015 at 6:59 PM, <rajesh at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Rajesh Paulraj<rajesh at multicorewareinc.com>
> # Date 1433336549 -19800
> # Wed Jun 03 18:32:29 2015 +0530
> # Node ID 462553e87d22db5d6939ab565b5893cd96ba2eba
> # Parent 43afbde189f390c74f580b0d377731b498c7f7ce
> asm: align tab_ChromaCoeffV constant to 32 bytes bound
> and modify all chroma vertical filters code
>
> diff -r 43afbde189f3 -r 462553e87d22 source/common/x86/ipfilter16.asm
> --- a/source/common/x86/ipfilter16.asm Fri Jun 05 11:03:10 2015 +0530
> +++ b/source/common/x86/ipfilter16.asm Wed Jun 03 18:32:29 2015 +0530
> @@ -44,29 +44,29 @@
> dw -2, 16, 54, -4
> dw -2, 10, 58, -2
>
> -tab_ChromaCoeffV: times 4 dw 0, 64
> - times 4 dw 0, 0
> -
> - times 4 dw -2, 58
> - times 4 dw 10, -2
> -
> - times 4 dw -4, 54
> - times 4 dw 16, -2
> -
> - times 4 dw -6, 46
> - times 4 dw 28, -4
> -
> - times 4 dw -4, 36
> - times 4 dw 36, -4
> -
> - times 4 dw -4, 28
> - times 4 dw 46, -6
> -
> - times 4 dw -2, 16
> - times 4 dw 54, -4
> -
> - times 4 dw -2, 10
> - times 4 dw 58, -2
> +const tab_ChromaCoeffV, times 8 dw 0, 64
> + times 8 dw 0, 0
> +
> + times 8 dw -2, 58
> + times 8 dw 10, -2
> +
> + times 8 dw -4, 54
> + times 8 dw 16, -2
> +
> + times 8 dw -6, 46
> + times 8 dw 28, -4
> +
> + times 8 dw -4, 36
> + times 8 dw 36, -4
> +
> + times 8 dw -4, 28
> + times 8 dw 46, -6
> +
> + times 8 dw -2, 16
> + times 8 dw 54, -4
> +
> + times 8 dw -2, 10
> + times 8 dw 58, -2
>
> tab_LumaCoeff: dw 0, 0, 0, 64, 0, 0, 0, 0
> dw -1, 4, -10, 58, 17, -5, 1, 0
> @@ -3292,34 +3292,34 @@
> movq m0, [r0]
> movq m1, [r0 + r1]
> punpcklwd m0, m1 ;m0=[0 1]
> - pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1
> + pmaddwd m0, [r6 + 0 *32] ;m0=[0+1] Row1
>
> lea r0, [r0 + 2 * r1]
> movq m4, [r0]
> punpcklwd m1, m4 ;m1=[1 2]
> - pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2
> + pmaddwd m1, [r6 + 0 *32] ;m1=[1+2] Row2
>
> movq m5, [r0 + r1]
> punpcklwd m4, m5 ;m4=[2 3]
> - pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3
> - pmaddwd m4, [r6 + 1 * 16]
> + pmaddwd m2, m4, [r6 + 0 *32] ;m2=[2+3] Row3
> + pmaddwd m4, [r6 + 1 * 32]
> paddd m0, m4 ;m0=[0+1+2+3] Row1 done
>
> lea r0, [r0 + 2 * r1]
> movq m4, [r0]
> punpcklwd m5, m4 ;m5=[3 4]
> - pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4
> - pmaddwd m5, [r6 + 1 * 16]
> + pmaddwd m3, m5, [r6 + 0 *32] ;m3=[3+4] Row4
> + pmaddwd m5, [r6 + 1 * 32]
> paddd m1, m5 ;m1 = [1+2+3+4] Row2
>
> movq m5, [r0 + r1]
> punpcklwd m4, m5 ;m4=[4 5]
> - pmaddwd m4, [r6 + 1 * 16]
> + pmaddwd m4, [r6 + 1 * 32]
> paddd m2, m4 ;m2=[2+3+4+5] Row3
>
> movq m4, [r0 + 2 * r1]
> punpcklwd m5, m4 ;m5=[5 6]
> - pmaddwd m5, [r6 + 1 * 16]
> + pmaddwd m5, [r6 + 1 * 32]
> paddd m3, m5 ;m3=[3+4+5+6] Row4
> %endmacro
>
> @@ -4022,7 +4022,7 @@
> add r1d, r1d
> add r3d, r3d
> sub r0, r1
> - shl r4d, 5
> + shl r4d, 6
>
> %ifdef PIC
> lea r5, [tab_ChromaCoeffV]
> @@ -4243,7 +4243,7 @@
> movd m2, [r0]
> punpcklwd m1, m2 ;m1=[1 2]
> punpcklqdq m0, m1 ;m0=[0 1 1 2]
> - pmaddwd m0, [%1 + 0 *16] ;m0=[0+1 1+2] Row 1-2
> + pmaddwd m0, [%1 + 0 *32] ;m0=[0+1 1+2] Row 1-2
>
> movd m1, [r0 + r1]
> punpcklwd m2, m1 ;m2=[2 3]
> @@ -4253,8 +4253,8 @@
> punpcklwd m1, m3 ;m2=[3 4]
> punpcklqdq m2, m1 ;m2=[2 3 3 4]
>
> - pmaddwd m4, m2, [%1 + 1 * 16] ;m4=[2+3 3+4] Row 1-2
> - pmaddwd m2, [%1 + 0 * 16] ;m2=[2+3 3+4] Row 3-4
> + pmaddwd m4, m2, [%1 + 1 * 32] ;m4=[2+3 3+4] Row 1-2
> + pmaddwd m2, [%1 + 0 * 32] ;m2=[2+3 3+4] Row 3-4
> paddd m0, m4 ;m0=[0+1+2+3 1+2+3+4] Row 1-2
>
> movd m1, [r0 + r1]
> @@ -4263,7 +4263,7 @@
> movd m4, [r0 + 2 * r1]
> punpcklwd m1, m4 ;m1=[5 6]
> punpcklqdq m3, m1 ;m2=[4 5 5 6]
> - pmaddwd m3, [%1 + 1 * 16] ;m3=[4+5 5+6] Row 3-4
> + pmaddwd m3, [%1 + 1 * 32] ;m3=[4+5 5+6] Row 3-4
> paddd m2, m3 ;m2=[2+3+4+5 3+4+5+6] Row 3-4
> %endmacro
>
> @@ -4277,7 +4277,7 @@
> add r1d, r1d
> add r3d, r3d
> sub r0, r1
> - shl r4d, 5
> + shl r4d, 6
>
> %ifdef PIC
> lea r5, [tab_ChromaCoeffV]
> @@ -4369,7 +4369,7 @@
> add r1d, r1d
> add r3d, r3d
> sub r0, r1
> - shl r4d, 5
> + shl r4d, 6
>
> %ifdef PIC
> lea r5, [tab_ChromaCoeffV]
> @@ -4403,21 +4403,21 @@
> movh m0, [r0]
> movh m1, [r0 + r1]
> punpcklwd m0, m1 ;m0=[0 1]
> - pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1
> + pmaddwd m0, [r5 + 0 *32] ;m0=[0+1] Row1
>
> lea r0, [r0 + 2 * r1]
> movh m2, [r0]
> punpcklwd m1, m2 ;m1=[1 2]
> - pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2
> + pmaddwd m1, [r5 + 0 *32] ;m1=[1+2] Row2
>
> movh m3, [r0 + r1]
> punpcklwd m2, m3 ;m4=[2 3]
> - pmaddwd m2, [r5 + 1 * 16]
> + pmaddwd m2, [r5 + 1 * 32]
> paddd m0, m2 ;m0=[0+1+2+3] Row1 done
>
> movh m2, [r0 + 2 * r1]
> punpcklwd m3, m2 ;m5=[3 4]
> - pmaddwd m3, [r5 + 1 * 16]
> + pmaddwd m3, [r5 + 1 * 32]
> paddd m1, m3 ;m1=[1+2+3+4] Row2 done
>
> %ifidn %2, ss
> @@ -4476,7 +4476,7 @@
> add r1d, r1d
> add r3d, r3d
> sub r0, r1
> - shl r4d, 5
> + shl r4d, 6
>
> %ifdef PIC
> lea r5, [tab_ChromaCoeffV]
> @@ -4610,31 +4610,31 @@
> movu m1, [r0]
> movu m3, [r0 + r1]
> punpcklwd m0, m1, m3
> - pmaddwd m0, [r5 + 0 * 16] ;m0 = [0l+1l] Row1l
> + pmaddwd m0, [r5 + 0 * 32] ;m0 = [0l+1l] Row1l
> punpckhwd m1, m3
> - pmaddwd m1, [r5 + 0 * 16] ;m1 = [0h+1h] Row1h
> + pmaddwd m1, [r5 + 0 * 32] ;m1 = [0h+1h] Row1h
>
> movu m4, [r0 + 2 * r1]
> punpcklwd m2, m3, m4
> - pmaddwd m2, [r5 + 0 * 16] ;m2 = [1l+2l] Row2l
> + pmaddwd m2, [r5 + 0 * 32] ;m2 = [1l+2l] Row2l
> punpckhwd m3, m4
> - pmaddwd m3, [r5 + 0 * 16] ;m3 = [1h+2h] Row2h
> + pmaddwd m3, [r5 + 0 * 32] ;m3 = [1h+2h] Row2h
>
> lea r0, [r0 + 2 * r1]
> movu m5, [r0 + r1]
> punpcklwd m6, m4, m5
> - pmaddwd m6, [r5 + 1 * 16] ;m6 = [2l+3l] Row1l
> + pmaddwd m6, [r5 + 1 * 32] ;m6 = [2l+3l] Row1l
> paddd m0, m6 ;m0 = [0l+1l+2l+3l] Row1l sum
> punpckhwd m4, m5
> - pmaddwd m4, [r5 + 1 * 16] ;m6 = [2h+3h] Row1h
> + pmaddwd m4, [r5 + 1 * 32] ;m6 = [2h+3h] Row1h
> paddd m1, m4 ;m1 = [0h+1h+2h+3h] Row1h sum
>
> movu m4, [r0 + 2 * r1]
> punpcklwd m6, m5, m4
> - pmaddwd m6, [r5 + 1 * 16] ;m6 = [3l+4l] Row2l
> + pmaddwd m6, [r5 + 1 * 32] ;m6 = [3l+4l] Row2l
> paddd m2, m6 ;m2 = [1l+2l+3l+4l] Row2l sum
> punpckhwd m5, m4
> - pmaddwd m5, [r5 + 1 * 16] ;m1 = [3h+4h] Row2h
> + pmaddwd m5, [r5 + 1 * 32] ;m1 = [3h+4h] Row2h
> paddd m3, m5 ;m3 = [1h+2h+3h+4h] Row2h sum
> %endmacro
>
> @@ -4648,7 +4648,7 @@
> add r1d, r1d
> add r3d, r3d
> sub r0, r1
> - shl r4d, 5
> + shl r4d, 6
>
> %ifdef PIC
> lea r5, [tab_ChromaCoeffV]
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150609/61bde4bd/attachment-0001.html>
More information about the x265-devel
mailing list