[x265] [PATCH Review only] asm: code for scale2D_64to32 routine
chen
chenm003 at 163.com
Fri Nov 15 16:50:16 CET 2013
Please send the full patch again.
The code is right, but please don't send a patch on top of a patch — resend it as a single complete patch.
At 2013-11-15 22:59:51,murugan at multicorewareinc.com wrote:
># HG changeset patch
># User Murugan Vairavel <murugan at multicorewareinc.com>
># Date 1384527565 -19800
># Fri Nov 15 20:29:25 2013 +0530
># Node ID c795b4a126875f24d9df0bfeb48ab28396c9d7c8
># Parent 939b58fa36f56506f32ad761f6c3df72e20e0f2b
>asm: code for scale2D_64to32 routine
>
>diff -r 939b58fa36f5 -r c795b4a12687 source/common/x86/pixel-a.asm
>--- a/source/common/x86/pixel-a.asm Thu Nov 14 20:21:29 2013 +0530
>+++ b/source/common/x86/pixel-a.asm Fri Nov 15 20:29:25 2013 +0530
>@@ -6848,101 +6848,112 @@
>
> RET
>
>-;-----------------------------------------------------------------
>-; void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride)
>-;-----------------------------------------------------------------
>-INIT_XMM ssse3
>-cglobal scale2D_64to32, 3, 7, 8, dest, src, stride
>-
>- mova m7, [pw_00ff]
>- mova m6, [pw_2]
>- xor r3, r3
>- mov r6d, 32
>-.loop
>-
>- mov r4, r3
>- imul r4, r2
>-
>- mov r5, r3
>- inc r5
>- imul r5, r2
>-
>- movu m0, [r1 + r4]
>- palignr m1, m0, 1
>- movu m2, [r1 + r5]
>- palignr m3, m2, 1
>-
>- pand m0, m7
>- pand m1, m7
>- pand m2, m7
>- pand m3, m7
>-
>- paddusw m0, m1
>- paddusw m0, m2
>- paddusw m0, m3
>- paddusw m0, m6
>-
>- psrlw m0, 2
>-
>- movu m4, [r1 + r4 + 16]
>- palignr m5, m4, 1
>- movu m1, [r1 + r5 + 16]
>- palignr m2, m1, 1
>-
>- pand m4, m7
>- pand m5, m7
>- pand m1, m7
>- pand m2, m7
>-
>- paddusw m4, m5
>- paddusw m4, m1
>- paddusw m4, m2
>- paddusw m4, m6
>- psrlw m4, 2
>-
>- packuswb m0, m4
>- movu [r0], m0
>-
>- movu m0, [r1 + r4 + 32]
>- palignr m1, m0, 1
>- movu m2, [r1 + r5 + 32]
>- palignr m3, m2, 1
>-
>- pand m0, m7
>- pand m1, m7
>- pand m2, m7
>- pand m3, m7
>-
>- paddusw m0, m1
>- paddusw m0, m2
>- paddusw m0, m3
>- paddusw m0, m6
>-
>- psrlw m0, 2
>-
>- movu m4, [r1 + r4 + 48]
>- palignr m5, m4, 1
>- movu m1, [r1 + r5 + 48]
>- palignr m2, m1, 1
>-
>- pand m4, m7
>- pand m5, m7
>- pand m1, m7
>- pand m2, m7
>-
>- paddusw m4, m5
>- paddusw m4, m1
>- paddusw m4, m2
>- paddusw m4, m6
>- psrlw m4, 2
>-
>- packuswb m0, m4
>- movu [r0 + 16], m0
>-
>- lea r0, [r0 + 32]
>- add r3, 2
>- dec r6d
>-
>- jnz .loop
>-
>+;-----------------------------------------------------------------
>+; void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride)
>+;-----------------------------------------------------------------
>+INIT_XMM ssse3
>+cglobal scale2D_64to32, 3, 4, 8, dest, src, stride
>+
>+ mova m7, [deinterleave_shuf]
>+ mov r3d, 32
>+.loop
>+
>+ movu m0, [r1] ;i
>+ palignr m1, m0, 1 ;j
>+ movu m2, [r1 + r2] ;k
>+ palignr m3, m2, 1 ;l
>+ movu m4, m0
>+ movu m5, m2
>+
>+ pxor m4, m1 ;i^j
>+ pxor m5, m3 ;k^l
>+ por m4, m5 ;ij|kl
>+
>+ pavgb m0, m1 ;s
>+ pavgb m2, m3 ;t
>+ movu m5, m0
>+ pavgb m0, m2 ;(s+t+1)/2
>+ pxor m5, m2 ;s^t
>+ pand m4, m5 ;(ij|kl)&st
>+ pand m4, [hmul_16p]
>+ psubb m0, m4 ;Result
>+
>+ movu m1, [r1 + 16] ;i
>+ palignr m2, m1, 1 ;j
>+ movu m3, [r1 + r2 + 16] ;k
>+ palignr m4, m3, 1 ;l
>+ movu m5, m1
>+ movu m6, m3
>+
>+ pxor m5, m2 ;i^j
>+ pxor m6, m4 ;k^l
>+ por m5, m6 ;ij|kl
>+
>+ pavgb m1, m2 ;s
>+ pavgb m3, m4 ;t
>+ movu m6, m1
>+ pavgb m1, m3 ;(s+t+1)/2
>+ pxor m6, m3 ;s^t
>+ pand m5, m6 ;(ij|kl)&st
>+ pand m5, [hmul_16p]
>+ psubb m1, m5 ;Result
>+
>+ pshufb m0, m0, m7
>+ pshufb m1, m1, m7
>+
>+ punpcklqdq m0, m1
>+ movu [r0], m0
>+
>+ movu m0, [r1 + 32] ;i
>+ palignr m1, m0, 1 ;j
>+ movu m2, [r1 + r2 + 32] ;k
>+ palignr m3, m2, 1 ;l
>+ movu m4, m0
>+ movu m5, m2
>+
>+ pxor m4, m1 ;i^j
>+ pxor m5, m3 ;k^l
>+ por m4, m5 ;ij|kl
>+
>+ pavgb m0, m1 ;s
>+ pavgb m2, m3 ;t
>+ movu m5, m0
>+ pavgb m0, m2 ;(s+t+1)/2
>+ pxor m5, m2 ;s^t
>+ pand m4, m5 ;(ij|kl)&st
>+ pand m4, [hmul_16p]
>+ psubb m0, m4 ;Result
>+
>+ movu m1, [r1 + 48] ;i
>+ palignr m2, m1, 1 ;j
>+ movu m3, [r1 + r2 + 48] ;k
>+ palignr m4, m3, 1 ;l
>+ movu m5, m1
>+ movu m6, m3
>+
>+ pxor m5, m2 ;i^j
>+ pxor m6, m4 ;k^l
>+ por m5, m6 ;ij|kl
>+
>+ pavgb m1, m2 ;s
>+ pavgb m3, m4 ;t
>+ movu m6, m1
>+ pavgb m1, m3 ;(s+t+1)/2
>+ pxor m6, m3 ;s^t
>+ pand m5, m6 ;(ij|kl)&st
>+ pand m5, [hmul_16p]
>+ psubb m1, m5 ;Result
>+
>+ pshufb m0, m0, m7
>+ pshufb m1, m1, m7
>+
>+ punpcklqdq m0, m1
>+ movu [r0 + 16], m0
>+
>+ lea r0, [r0 + 32]
>+ lea r1, [r1 + 2 * r2]
>+ dec r3d
>+
>+ jnz .loop
>+
> RET
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131115/e2774148/attachment-0001.html>
More information about the x265-devel mailing list