[x265] [PATCH] asm: 10bpp code for scale2D_64to32 routine

chen chenm003 at 163.com
Thu Dec 5 10:22:21 CET 2013


Applied with a small modification:
pshufb    m0,    m0,    m7
 
In the Intel instruction documentation, pshufb takes only two operands; the three-operand style is for AVX and has an extra code byte,
so I suggest using the two-operand style when you do not really need to hide a register move.

At 2013-12-05 15:59:24,murugan at multicorewareinc.com wrote:
># HG changeset patch
># User Murugan Vairavel <murugan at multicorewareinc.com>
># Date 1386230342 -19800
>#      Thu Dec 05 13:29:02 2013 +0530
># Node ID dbfde5222782eec2ba414d473fd4ba2494c6f333
># Parent  e4a7885f377e37841c3ecd8e2419454fa1ba03db
>asm: 10bpp code for scale2D_64to32 routine
>
>diff -r e4a7885f377e -r dbfde5222782 source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp Wed Dec 04 13:45:29 2013 -0600
>+++ b/source/common/x86/asm-primitives.cpp Thu Dec 05 13:29:02 2013 +0530
>@@ -567,6 +567,7 @@
>     if (cpuMask & X265_CPU_SSSE3)
>     {
>         p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
>+        p.scale2D_64to32 = x265_scale2D_64to32_ssse3;
>     }
>     if (cpuMask & X265_CPU_SSE4)
>     {
>diff -r e4a7885f377e -r dbfde5222782 source/common/x86/pixel-util8.asm
>--- a/source/common/x86/pixel-util8.asm Wed Dec 04 13:45:29 2013 -0600
>+++ b/source/common/x86/pixel-util8.asm Thu Dec 05 13:29:02 2013 +0530
>@@ -47,6 +47,8 @@
> deinterleave_word_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 15, 15
> hmul_16p:  times 16 db 1
>            times 8 db 1, -1
>+hmulw_16p:  times 8 dw 1
>+            times 4 dw 1, -1
> 
> SECTION .text
> 
>@@ -1797,9 +1799,173 @@
> ;-----------------------------------------------------------------
> INIT_XMM ssse3
> cglobal scale2D_64to32, 3, 4, 8, dest, src, stride
>-
>+    mov       r3d,    32
>+%if HIGH_BIT_DEPTH
>+    mova      m7,    [deinterleave_word_shuf]
>+    add       r2,    r2
>+.loop
>+    movu      m0,    [r1]                  ;i
>+    movu      m1,    [r1 + 2]              ;j
>+    movu      m2,    [r1 + r2]             ;k
>+    movu      m3,    [r1 + r2 + 2]         ;l
>+    movu      m4,    m0
>+    movu      m5,    m2
>+    pxor      m4,    m1                    ;i^j
>+    pxor      m5,    m3                    ;k^l
>+    por       m4,    m5                    ;ij|kl
>+    pavgw     m0,    m1                    ;s
>+    pavgw     m2,    m3                    ;t
>+    movu      m5,    m0
>+    pavgw     m0,    m2                    ;(s+t+1)/2
>+    pxor      m5,    m2                    ;s^t
>+    pand      m4,    m5                    ;(ij|kl)&st
>+    pand      m4,    [hmulw_16p]
>+    psubw     m0,    m4                    ;Result
>+    movu      m1,    [r1 + 16]             ;i
>+    movu      m2,    [r1 + 16 + 2]         ;j
>+    movu      m3,    [r1 + r2 + 16]        ;k
>+    movu      m4,    [r1 + r2 + 16 + 2]    ;l
>+    movu      m5,    m1
>+    movu      m6,    m3
>+    pxor      m5,    m2                    ;i^j
>+    pxor      m6,    m4                    ;k^l
>+    por       m5,    m6                    ;ij|kl
>+    pavgw     m1,    m2                    ;s
>+    pavgw     m3,    m4                    ;t
>+    movu      m6,    m1
>+    pavgw     m1,    m3                    ;(s+t+1)/2
>+    pxor      m6,    m3                    ;s^t
>+    pand      m5,    m6                    ;(ij|kl)&st
>+    pand      m5,    [hmulw_16p]
>+    psubw     m1,    m5                    ;Result
>+    pshufb    m0,    m0,    m7
>+    pshufb    m1,    m1,    m7
>+
>+    punpcklqdq    m0,       m1
>+    movu          [r0],     m0
>+
>+    movu      m0,    [r1 + 32]             ;i
>+    movu      m1,    [r1 + 32 + 2]         ;j
>+    movu      m2,    [r1 + r2 + 32]        ;k
>+    movu      m3,    [r1 + r2 + 32 + 2]    ;l
>+    movu      m4,    m0
>+    movu      m5,    m2
>+    pxor      m4,    m1                    ;i^j
>+    pxor      m5,    m3                    ;k^l
>+    por       m4,    m5                    ;ij|kl
>+    pavgw     m0,    m1                    ;s
>+    pavgw     m2,    m3                    ;t
>+    movu      m5,    m0
>+    pavgw     m0,    m2                    ;(s+t+1)/2
>+    pxor      m5,    m2                    ;s^t
>+    pand      m4,    m5                    ;(ij|kl)&st
>+    pand      m4,    [hmulw_16p]
>+    psubw     m0,    m4                    ;Result
>+    movu      m1,    [r1 + 48]             ;i
>+    movu      m2,    [r1 + 48 + 2]         ;j
>+    movu      m3,    [r1 + r2 + 48]        ;k
>+    movu      m4,    [r1 + r2 + 48 + 2]    ;l
>+    movu      m5,    m1
>+    movu      m6,    m3
>+    pxor      m5,    m2                    ;i^j
>+    pxor      m6,    m4                    ;k^l
>+    por       m5,    m6                    ;ij|kl
>+    pavgw     m1,    m2                    ;s
>+    pavgw     m3,    m4                    ;t
>+    movu      m6,    m1
>+    pavgw     m1,    m3                    ;(s+t+1)/2
>+    pxor      m6,    m3                    ;s^t
>+    pand      m5,    m6                    ;(ij|kl)&st
>+    pand      m5,    [hmulw_16p]
>+    psubw     m1,    m5                    ;Result
>+    pshufb    m0,    m0,    m7
>+    pshufb    m1,    m1,    m7
>+
>+    punpcklqdq    m0,           m1
>+    movu          [r0 + 16],    m0
>+
>+    movu      m0,    [r1 + 64]             ;i
>+    movu      m1,    [r1 + 64 + 2]         ;j
>+    movu      m2,    [r1 + r2 + 64]        ;k
>+    movu      m3,    [r1 + r2 + 64 + 2]    ;l
>+    movu      m4,    m0
>+    movu      m5,    m2
>+    pxor      m4,    m1                    ;i^j
>+    pxor      m5,    m3                    ;k^l
>+    por       m4,    m5                    ;ij|kl
>+    pavgw     m0,    m1                    ;s
>+    pavgw     m2,    m3                    ;t
>+    movu      m5,    m0
>+    pavgw     m0,    m2                    ;(s+t+1)/2
>+    pxor      m5,    m2                    ;s^t
>+    pand      m4,    m5                    ;(ij|kl)&st
>+    pand      m4,    [hmulw_16p]
>+    psubw     m0,    m4                    ;Result
>+    movu      m1,    [r1 + 80]             ;i
>+    movu      m2,    [r1 + 80 + 2]         ;j
>+    movu      m3,    [r1 + r2 + 80]        ;k
>+    movu      m4,    [r1 + r2 + 80 + 2]    ;l
>+    movu      m5,    m1
>+    movu      m6,    m3
>+    pxor      m5,    m2                    ;i^j
>+    pxor      m6,    m4                    ;k^l
>+    por       m5,    m6                    ;ij|kl
>+    pavgw     m1,    m2                    ;s
>+    pavgw     m3,    m4                    ;t
>+    movu      m6,    m1
>+    pavgw     m1,    m3                    ;(s+t+1)/2
>+    pxor      m6,    m3                    ;s^t
>+    pand      m5,    m6                    ;(ij|kl)&st
>+    pand      m5,    [hmulw_16p]
>+    psubw     m1,    m5                    ;Result
>+    pshufb    m0,    m0,    m7
>+    pshufb    m1,    m1,    m7
>+
>+    punpcklqdq    m0,           m1
>+    movu          [r0 + 32],    m0
>+
>+    movu      m0,    [r1 + 96]             ;i
>+    movu      m1,    [r1 + 96 + 2]         ;j
>+    movu      m2,    [r1 + r2 + 96]        ;k
>+    movu      m3,    [r1 + r2 + 96 + 2]    ;l
>+    movu      m4,    m0
>+    movu      m5,    m2
>+    pxor      m4,    m1                    ;i^j
>+    pxor      m5,    m3                    ;k^l
>+    por       m4,    m5                    ;ij|kl
>+    pavgw     m0,    m1                    ;s
>+    pavgw     m2,    m3                    ;t
>+    movu      m5,    m0
>+    pavgw     m0,    m2                    ;(s+t+1)/2
>+    pxor      m5,    m2                    ;s^t
>+    pand      m4,    m5                    ;(ij|kl)&st
>+    pand      m4,    [hmulw_16p]
>+    psubw     m0,    m4                    ;Result
>+    movu      m1,    [r1 + 112]             ;i
>+    movu      m2,    [r1 + 112 + 2]         ;j
>+    movu      m3,    [r1 + r2 + 112]        ;k
>+    movu      m4,    [r1 + r2 + 112 + 2]    ;l
>+    movu      m5,    m1
>+    movu      m6,    m3
>+    pxor      m5,    m2                    ;i^j
>+    pxor      m6,    m4                    ;k^l
>+    por       m5,    m6                    ;ij|kl
>+    pavgw     m1,    m2                    ;s
>+    pavgw     m3,    m4                    ;t
>+    movu      m6,    m1
>+    pavgw     m1,    m3                    ;(s+t+1)/2
>+    pxor      m6,    m3                    ;s^t
>+    pand      m5,    m6                    ;(ij|kl)&st
>+    pand      m5,    [hmulw_16p]
>+    psubw     m1,    m5                    ;Result
>+    pshufb    m0,    m0,    m7
>+    pshufb    m1,    m1,    m7
>+
>+    punpcklqdq    m0,           m1
>+    movu          [r0 + 48],    m0
>+    lea    r0,    [r0 + 64]
>+%else
>     mova        m7,      [deinterleave_shuf]
>-    mov         r3d,     32
> .loop
> 
>     movu        m0,      [r1]                  ;i
>@@ -1895,9 +2061,9 @@
>     movu          [r0 + 16],    m0
> 
>     lea    r0,    [r0 + 32]
>+%endif
>     lea    r1,    [r1 + 2 * r2]
>     dec    r3d
>-
>     jnz    .loop
> 
> RET
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131205/773e1691/attachment-0001.html>


More information about the x265-devel mailing list