[x265] [PATCH 3 of 3] asm: code for input pixel upShift/downShift
chen
chenm003 at 163.com
Mon Mar 24 16:58:47 CET 2014
On Fri, Mar 21, 2014 at 8:01 PM, chen <chenm003 at 163.com> wrote:
At 2014-03-21 13:35:31,murugan at multicorewareinc.com wrote:
># HG changeset patch
># User Murugan Vairavel <murugan at multicorewareinc.com>
># Date 1395379456 -19800
># Fri Mar 21 10:54:16 2014 +0530
># Node ID 29728f7728591116192575d411ef2db2dff49c18
># Parent 435e50b2b92c83e10fdb2bd86bc8e8df91b7338b
>asm: code for input pixel upShift/downShift
>
>+; Input 10bpp, Output 8bpp, width is multiple of 16
>+;------------------------------------------------------------------------------------------------------------------------
>+;void planecopy_sp(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
>+;------------------------------------------------------------------------------------------------------------------------
>+INIT_XMM sse2
>+cglobal downShift_10, 7,7,3
>+ movd m0, r6d ; m0 = shift
>+ add r1, r1
>+ dec r5d
>+.loopH:
>+ xor r6, r6
tip: r6 is only an offset. If you prepare 'r1=r1-r4' beforehand, you can operate directly on r0.
But the number of pixels processed in each row is not equal to the width (r4) when the width is not a multiple of 16. If I do it as above, an output mismatch will occur.
Your algorithm loops over the width in multiples of 16 for every row except the last one. You do not need to modify this part now; this is just for your information.
>+.loopW:
>+ movu m1, [r0 + r6 * 2]
>+ movu m2, [r0 + r6 * 2 + 16]
>+ psrlw m1, m0
>+ psrlw m2, m0
>+ packuswb m1, m2
>+ movu [r2 + r6], m1
>+
>+ add r6, 16
>+ cmp r6d, r4d
>+ jl .loopW
>+
>+ ; move to next row
>+ lea r0, [r0 + r1]
>+ lea r2, [r2 + r3]
add r0,r1
add r2,r3
I will modify that.
>+ dec r5d
>+ jnz .loopH
>+
>+;processing last row of every frame [To handle width which not a multiple of 16]
>+
>+.loop16:
>+ movu m1, [r0]
>+ movu m2, [r0 + 16]
>+ psrlw m1, m0
>+ psrlw m2, m0
>+ packuswb m1, m2
>+ movu [r2], m1
>+
>+ add r0, 2 * mmsize
>+ add r2, mmsize
>+ sub r4d, 16
>+ jz .end
>+ cmp r4d, 15
>+ jg .loop16
--> (X > 16) && (X >15) ??
What does this mean?
r4d = X
sub r4d,16 & cmp & jz -> (X-16 == 0)
cmp r4d, 15 & jg -> (X-16 > 15) <--- the logic here has a small problem: it is correct but redundant. When it is true, it simply means (X-16 >= 16) -_-!
>+ cmp r4d, 8
>+ jl .process4
>+ movu m1, [r0]
>+ psrlw m1, m0
>+ packuswb m1, m1
>+ movh [r2], m1
>+
>+ add r0, mmsize
>+ add r2, 8
>+ sub r4d, 8
>+ jz .end
>+
>+.process4:
>+ cmp r4d, 4
>+ jl .process2
>+ movh m1,[r0]
>+ psrlw m1, m0
>+ packuswb m1, m1
>+ movd [r2], m1
>+
>+ add r0, 8
>+ add r2, 4
>+ sub r4d, 4
>+ jz .end
>+
>+.process2:
>+ cmp r4d, 2
>+ jl .process1
>+ movd m1, [r0]
>+ psrlw m1, m0
>+ packuswb m1, m1
>+ movd r6, m1
>+ mov [r2], r6w
>+
>+ add r0, 4
>+ add r2, 2
>+ sub r4d, 2
>+ jz .end
>+
>+.process1:
>+ movd m1, [r0]
>+ psrlw m1, m0
>+ packuswb m1, m1
>+ movd r6, m1
>+ mov [r2], r6b
>+.end:
>+ RET
The (4, 2, 1)-pixel paths may share the calculation code.
Do you mean defining a macro for that?
No. The calculation for the last 16 or 8 pixels can cover all of these cases (the results are just in different parts of the register), so we do not need to calculate several times.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20140324/8864aedb/attachment-0001.html>
More information about the x265-devel
mailing list