[x265] [PATCH] asm: 10bpp code for scale2D_64to32 routine
chen
chenm003 at 163.com
Thu Dec 5 10:22:21 CET 2013
Applied with a small modification:
pshufb m0, m0, m7
In the Intel instruction documents, pshufb takes only two operands; the three-operand form is the AVX (VEX-encoded) version and has an extra encoding byte,
so I suggest using the two-operand form unless you really need to hide a register move.
At 2013-12-05 15:59:24,murugan at multicorewareinc.com wrote:
># HG changeset patch
># User Murugan Vairavel <murugan at multicorewareinc.com>
># Date 1386230342 -19800
># Thu Dec 05 13:29:02 2013 +0530
># Node ID dbfde5222782eec2ba414d473fd4ba2494c6f333
># Parent e4a7885f377e37841c3ecd8e2419454fa1ba03db
>asm: 10bpp code for scale2D_64to32 routine
>
>diff -r e4a7885f377e -r dbfde5222782 source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp Wed Dec 04 13:45:29 2013 -0600
>+++ b/source/common/x86/asm-primitives.cpp Thu Dec 05 13:29:02 2013 +0530
>@@ -567,6 +567,7 @@
> if (cpuMask & X265_CPU_SSSE3)
> {
> p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
>+ p.scale2D_64to32 = x265_scale2D_64to32_ssse3;
> }
> if (cpuMask & X265_CPU_SSE4)
> {
>diff -r e4a7885f377e -r dbfde5222782 source/common/x86/pixel-util8.asm
>--- a/source/common/x86/pixel-util8.asm Wed Dec 04 13:45:29 2013 -0600
>+++ b/source/common/x86/pixel-util8.asm Thu Dec 05 13:29:02 2013 +0530
>@@ -47,6 +47,8 @@
> deinterleave_word_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 15, 15
> hmul_16p: times 16 db 1
> times 8 db 1, -1
>+hmulw_16p: times 8 dw 1
>+ times 4 dw 1, -1
>
> SECTION .text
>
>@@ -1797,9 +1799,173 @@
> ;-----------------------------------------------------------------
> INIT_XMM ssse3
> cglobal scale2D_64to32, 3, 4, 8, dest, src, stride
>-
>+ mov r3d, 32
>+%if HIGH_BIT_DEPTH
>+ mova m7, [deinterleave_word_shuf]
>+ add r2, r2
>+.loop
>+ movu m0, [r1] ;i
>+ movu m1, [r1 + 2] ;j
>+ movu m2, [r1 + r2] ;k
>+ movu m3, [r1 + r2 + 2] ;l
>+ movu m4, m0
>+ movu m5, m2
>+ pxor m4, m1 ;i^j
>+ pxor m5, m3 ;k^l
>+ por m4, m5 ;ij|kl
>+ pavgw m0, m1 ;s
>+ pavgw m2, m3 ;t
>+ movu m5, m0
>+ pavgw m0, m2 ;(s+t+1)/2
>+ pxor m5, m2 ;s^t
>+ pand m4, m5 ;(ij|kl)&st
>+ pand m4, [hmulw_16p]
>+ psubw m0, m4 ;Result
>+ movu m1, [r1 + 16] ;i
>+ movu m2, [r1 + 16 + 2] ;j
>+ movu m3, [r1 + r2 + 16] ;k
>+ movu m4, [r1 + r2 + 16 + 2] ;l
>+ movu m5, m1
>+ movu m6, m3
>+ pxor m5, m2 ;i^j
>+ pxor m6, m4 ;k^l
>+ por m5, m6 ;ij|kl
>+ pavgw m1, m2 ;s
>+ pavgw m3, m4 ;t
>+ movu m6, m1
>+ pavgw m1, m3 ;(s+t+1)/2
>+ pxor m6, m3 ;s^t
>+ pand m5, m6 ;(ij|kl)&st
>+ pand m5, [hmulw_16p]
>+ psubw m1, m5 ;Result
>+ pshufb m0, m0, m7
>+ pshufb m1, m1, m7
>+
>+ punpcklqdq m0, m1
>+ movu [r0], m0
>+
>+ movu m0, [r1 + 32] ;i
>+ movu m1, [r1 + 32 + 2] ;j
>+ movu m2, [r1 + r2 + 32] ;k
>+ movu m3, [r1 + r2 + 32 + 2] ;l
>+ movu m4, m0
>+ movu m5, m2
>+ pxor m4, m1 ;i^j
>+ pxor m5, m3 ;k^l
>+ por m4, m5 ;ij|kl
>+ pavgw m0, m1 ;s
>+ pavgw m2, m3 ;t
>+ movu m5, m0
>+ pavgw m0, m2 ;(s+t+1)/2
>+ pxor m5, m2 ;s^t
>+ pand m4, m5 ;(ij|kl)&st
>+ pand m4, [hmulw_16p]
>+ psubw m0, m4 ;Result
>+ movu m1, [r1 + 48] ;i
>+ movu m2, [r1 + 48 + 2] ;j
>+ movu m3, [r1 + r2 + 48] ;k
>+ movu m4, [r1 + r2 + 48 + 2] ;l
>+ movu m5, m1
>+ movu m6, m3
>+ pxor m5, m2 ;i^j
>+ pxor m6, m4 ;k^l
>+ por m5, m6 ;ij|kl
>+ pavgw m1, m2 ;s
>+ pavgw m3, m4 ;t
>+ movu m6, m1
>+ pavgw m1, m3 ;(s+t+1)/2
>+ pxor m6, m3 ;s^t
>+ pand m5, m6 ;(ij|kl)&st
>+ pand m5, [hmulw_16p]
>+ psubw m1, m5 ;Result
>+ pshufb m0, m0, m7
>+ pshufb m1, m1, m7
>+
>+ punpcklqdq m0, m1
>+ movu [r0 + 16], m0
>+
>+ movu m0, [r1 + 64] ;i
>+ movu m1, [r1 + 64 + 2] ;j
>+ movu m2, [r1 + r2 + 64] ;k
>+ movu m3, [r1 + r2 + 64 + 2] ;l
>+ movu m4, m0
>+ movu m5, m2
>+ pxor m4, m1 ;i^j
>+ pxor m5, m3 ;k^l
>+ por m4, m5 ;ij|kl
>+ pavgw m0, m1 ;s
>+ pavgw m2, m3 ;t
>+ movu m5, m0
>+ pavgw m0, m2 ;(s+t+1)/2
>+ pxor m5, m2 ;s^t
>+ pand m4, m5 ;(ij|kl)&st
>+ pand m4, [hmulw_16p]
>+ psubw m0, m4 ;Result
>+ movu m1, [r1 + 80] ;i
>+ movu m2, [r1 + 80 + 2] ;j
>+ movu m3, [r1 + r2 + 80] ;k
>+ movu m4, [r1 + r2 + 80 + 2] ;l
>+ movu m5, m1
>+ movu m6, m3
>+ pxor m5, m2 ;i^j
>+ pxor m6, m4 ;k^l
>+ por m5, m6 ;ij|kl
>+ pavgw m1, m2 ;s
>+ pavgw m3, m4 ;t
>+ movu m6, m1
>+ pavgw m1, m3 ;(s+t+1)/2
>+ pxor m6, m3 ;s^t
>+ pand m5, m6 ;(ij|kl)&st
>+ pand m5, [hmulw_16p]
>+ psubw m1, m5 ;Result
>+ pshufb m0, m0, m7
>+ pshufb m1, m1, m7
>+
>+ punpcklqdq m0, m1
>+ movu [r0 + 32], m0
>+
>+ movu m0, [r1 + 96] ;i
>+ movu m1, [r1 + 96 + 2] ;j
>+ movu m2, [r1 + r2 + 96] ;k
>+ movu m3, [r1 + r2 + 96 + 2] ;l
>+ movu m4, m0
>+ movu m5, m2
>+ pxor m4, m1 ;i^j
>+ pxor m5, m3 ;k^l
>+ por m4, m5 ;ij|kl
>+ pavgw m0, m1 ;s
>+ pavgw m2, m3 ;t
>+ movu m5, m0
>+ pavgw m0, m2 ;(s+t+1)/2
>+ pxor m5, m2 ;s^t
>+ pand m4, m5 ;(ij|kl)&st
>+ pand m4, [hmulw_16p]
>+ psubw m0, m4 ;Result
>+ movu m1, [r1 + 112] ;i
>+ movu m2, [r1 + 112 + 2] ;j
>+ movu m3, [r1 + r2 + 112] ;k
>+ movu m4, [r1 + r2 + 112 + 2] ;l
>+ movu m5, m1
>+ movu m6, m3
>+ pxor m5, m2 ;i^j
>+ pxor m6, m4 ;k^l
>+ por m5, m6 ;ij|kl
>+ pavgw m1, m2 ;s
>+ pavgw m3, m4 ;t
>+ movu m6, m1
>+ pavgw m1, m3 ;(s+t+1)/2
>+ pxor m6, m3 ;s^t
>+ pand m5, m6 ;(ij|kl)&st
>+ pand m5, [hmulw_16p]
>+ psubw m1, m5 ;Result
>+ pshufb m0, m0, m7
>+ pshufb m1, m1, m7
>+
>+ punpcklqdq m0, m1
>+ movu [r0 + 48], m0
>+ lea r0, [r0 + 64]
>+%else
> mova m7, [deinterleave_shuf]
>- mov r3d, 32
> .loop
>
> movu m0, [r1] ;i
>@@ -1895,9 +2061,9 @@
> movu [r0 + 16], m0
>
> lea r0, [r0 + 32]
>+%endif
> lea r1, [r1 + 2 * r2]
> dec r3d
>-
> jnz .loop
>
> RET
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131205/773e1691/attachment-0001.html>
More information about the x265-devel
mailing list