[x265] [PATCH] asm: 10bpp code for scale2D_64to32 routine

murugan at multicorewareinc.com murugan at multicorewareinc.com
Thu Dec 5 08:59:24 CET 2013


# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1386230342 -19800
#      Thu Dec 05 13:29:02 2013 +0530
# Node ID dbfde5222782eec2ba414d473fd4ba2494c6f333
# Parent  e4a7885f377e37841c3ecd8e2419454fa1ba03db
asm: 10bpp code for scale2D_64to32 routine

diff -r e4a7885f377e -r dbfde5222782 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Dec 04 13:45:29 2013 -0600
+++ b/source/common/x86/asm-primitives.cpp	Thu Dec 05 13:29:02 2013 +0530
@@ -567,6 +567,7 @@
     if (cpuMask & X265_CPU_SSSE3)
     {
         p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
+        p.scale2D_64to32 = x265_scale2D_64to32_ssse3;
     }
     if (cpuMask & X265_CPU_SSE4)
     {
diff -r e4a7885f377e -r dbfde5222782 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Wed Dec 04 13:45:29 2013 -0600
+++ b/source/common/x86/pixel-util8.asm	Thu Dec 05 13:29:02 2013 +0530
@@ -47,6 +47,8 @@
 deinterleave_word_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 15, 15
 hmul_16p:  times 16 db 1
            times 8 db 1, -1
+hmulw_16p:  times 8 dw 1        ; 1st xmmword: all-ones words — used below as a per-word LSB mask (pand)
+            times 4 dw 1, -1    ; 2nd xmmword: +1/-1 word pattern, word analogue of hmul_16p above
 
 SECTION .text
 
@@ -1797,9 +1799,173 @@
 ;-----------------------------------------------------------------
 INIT_XMM ssse3
 cglobal scale2D_64to32, 3, 4, 8, dest, src, stride
-
+    mov       r3d,    32                  ;32 output rows
+%if HIGH_BIT_DEPTH
+    mova      m7,    [deinterleave_word_shuf]    ;gather even words 0,2,4,6 into the low qword
+    add       r2,    r2                   ;stride given in samples; 2 bytes per 16-bit sample
+.loop
+    movu      m0,    [r1]                  ;i = row0 words 0-7
+    movu      m1,    [r1 + 2]              ;j = row0 words 1-8
+    movu      m2,    [r1 + r2]             ;k = row1 words 0-7
+    movu      m3,    [r1 + r2 + 2]         ;l = row1 words 1-8
+    movu      m4,    m0                    ;copy of i
+    movu      m5,    m2                    ;copy of k
+    pxor      m4,    m1                    ;i^j
+    pxor      m5,    m3                    ;k^l
+    por       m4,    m5                    ;(i^j)|(k^l)
+    pavgw     m0,    m1                    ;s = (i+j+1)>>1
+    pavgw     m2,    m3                    ;t = (k+l+1)>>1
+    movu      m5,    m0                    ;copy of s
+    pavgw     m0,    m2                    ;(s+t+1)>>1
+    pxor      m5,    m2                    ;s^t
+    pand      m4,    m5                    ;((i^j)|(k^l))&(s^t)
+    pand      m4,    [hmulw_16p]           ;keep only bit 0 of each word
+    psubw     m0,    m4                    ;rounding fix: result = (i+j+k+l+2)>>2
+    movu      m1,    [r1 + 16]             ;i = row0 words 8-15
+    movu      m2,    [r1 + 16 + 2]         ;j = row0 words 9-16
+    movu      m3,    [r1 + r2 + 16]        ;k = row1 words 8-15
+    movu      m4,    [r1 + r2 + 16 + 2]    ;l = row1 words 9-16
+    movu      m5,    m1                    ;copy of i
+    movu      m6,    m3                    ;copy of k
+    pxor      m5,    m2                    ;i^j
+    pxor      m6,    m4                    ;k^l
+    por       m5,    m6                    ;(i^j)|(k^l)
+    pavgw     m1,    m2                    ;s = (i+j+1)>>1
+    pavgw     m3,    m4                    ;t = (k+l+1)>>1
+    movu      m6,    m1                    ;copy of s
+    pavgw     m1,    m3                    ;(s+t+1)>>1
+    pxor      m6,    m3                    ;s^t
+    pand      m5,    m6                    ;((i^j)|(k^l))&(s^t)
+    pand      m5,    [hmulw_16p]           ;keep only bit 0 of each word
+    psubw     m1,    m5                    ;rounding fix: result = (i+j+k+l+2)>>2
+    pshufb    m0,    m0,    m7             ;even words of m0 -> low qword
+    pshufb    m1,    m1,    m7             ;even words of m1 -> low qword
+
+    punpcklqdq    m0,       m1             ;join the two 8-word halves
+    movu          [r0],     m0             ;store dest words 0-7
+
+    movu      m0,    [r1 + 32]             ;i = row0 words 16-23
+    movu      m1,    [r1 + 32 + 2]         ;j = row0 words 17-24
+    movu      m2,    [r1 + r2 + 32]        ;k = row1 words 16-23
+    movu      m3,    [r1 + r2 + 32 + 2]    ;l = row1 words 17-24
+    movu      m4,    m0                    ;copy of i
+    movu      m5,    m2                    ;copy of k
+    pxor      m4,    m1                    ;i^j
+    pxor      m5,    m3                    ;k^l
+    por       m4,    m5                    ;(i^j)|(k^l)
+    pavgw     m0,    m1                    ;s = (i+j+1)>>1
+    pavgw     m2,    m3                    ;t = (k+l+1)>>1
+    movu      m5,    m0                    ;copy of s
+    pavgw     m0,    m2                    ;(s+t+1)>>1
+    pxor      m5,    m2                    ;s^t
+    pand      m4,    m5                    ;((i^j)|(k^l))&(s^t)
+    pand      m4,    [hmulw_16p]           ;keep only bit 0 of each word
+    psubw     m0,    m4                    ;rounding fix: result = (i+j+k+l+2)>>2
+    movu      m1,    [r1 + 48]             ;i = row0 words 24-31
+    movu      m2,    [r1 + 48 + 2]         ;j = row0 words 25-32
+    movu      m3,    [r1 + r2 + 48]        ;k = row1 words 24-31
+    movu      m4,    [r1 + r2 + 48 + 2]    ;l = row1 words 25-32
+    movu      m5,    m1                    ;copy of i
+    movu      m6,    m3                    ;copy of k
+    pxor      m5,    m2                    ;i^j
+    pxor      m6,    m4                    ;k^l
+    por       m5,    m6                    ;(i^j)|(k^l)
+    pavgw     m1,    m2                    ;s = (i+j+1)>>1
+    pavgw     m3,    m4                    ;t = (k+l+1)>>1
+    movu      m6,    m1                    ;copy of s
+    pavgw     m1,    m3                    ;(s+t+1)>>1
+    pxor      m6,    m3                    ;s^t
+    pand      m5,    m6                    ;((i^j)|(k^l))&(s^t)
+    pand      m5,    [hmulw_16p]           ;keep only bit 0 of each word
+    psubw     m1,    m5                    ;rounding fix: result = (i+j+k+l+2)>>2
+    pshufb    m0,    m0,    m7             ;even words of m0 -> low qword
+    pshufb    m1,    m1,    m7             ;even words of m1 -> low qword
+
+    punpcklqdq    m0,           m1         ;join the two 8-word halves
+    movu          [r0 + 16],    m0         ;store dest words 8-15
+
+    movu      m0,    [r1 + 64]             ;i = row0 words 32-39
+    movu      m1,    [r1 + 64 + 2]         ;j = row0 words 33-40
+    movu      m2,    [r1 + r2 + 64]        ;k = row1 words 32-39
+    movu      m3,    [r1 + r2 + 64 + 2]    ;l = row1 words 33-40
+    movu      m4,    m0                    ;copy of i
+    movu      m5,    m2                    ;copy of k
+    pxor      m4,    m1                    ;i^j
+    pxor      m5,    m3                    ;k^l
+    por       m4,    m5                    ;(i^j)|(k^l)
+    pavgw     m0,    m1                    ;s = (i+j+1)>>1
+    pavgw     m2,    m3                    ;t = (k+l+1)>>1
+    movu      m5,    m0                    ;copy of s
+    pavgw     m0,    m2                    ;(s+t+1)>>1
+    pxor      m5,    m2                    ;s^t
+    pand      m4,    m5                    ;((i^j)|(k^l))&(s^t)
+    pand      m4,    [hmulw_16p]           ;keep only bit 0 of each word
+    psubw     m0,    m4                    ;rounding fix: result = (i+j+k+l+2)>>2
+    movu      m1,    [r1 + 80]             ;i = row0 words 40-47
+    movu      m2,    [r1 + 80 + 2]         ;j = row0 words 41-48
+    movu      m3,    [r1 + r2 + 80]        ;k = row1 words 40-47
+    movu      m4,    [r1 + r2 + 80 + 2]    ;l = row1 words 41-48
+    movu      m5,    m1                    ;copy of i
+    movu      m6,    m3                    ;copy of k
+    pxor      m5,    m2                    ;i^j
+    pxor      m6,    m4                    ;k^l
+    por       m5,    m6                    ;(i^j)|(k^l)
+    pavgw     m1,    m2                    ;s = (i+j+1)>>1
+    pavgw     m3,    m4                    ;t = (k+l+1)>>1
+    movu      m6,    m1                    ;copy of s
+    pavgw     m1,    m3                    ;(s+t+1)>>1
+    pxor      m6,    m3                    ;s^t
+    pand      m5,    m6                    ;((i^j)|(k^l))&(s^t)
+    pand      m5,    [hmulw_16p]           ;keep only bit 0 of each word
+    psubw     m1,    m5                    ;rounding fix: result = (i+j+k+l+2)>>2
+    pshufb    m0,    m0,    m7             ;even words of m0 -> low qword
+    pshufb    m1,    m1,    m7             ;even words of m1 -> low qword
+
+    punpcklqdq    m0,           m1         ;join the two 8-word halves
+    movu          [r0 + 32],    m0         ;store dest words 16-23
+
+    movu      m0,    [r1 + 96]             ;i = row0 words 48-55
+    movu      m1,    [r1 + 96 + 2]         ;j = row0 words 49-56
+    movu      m2,    [r1 + r2 + 96]        ;k = row1 words 48-55
+    movu      m3,    [r1 + r2 + 96 + 2]    ;l = row1 words 49-56
+    movu      m4,    m0                    ;copy of i
+    movu      m5,    m2                    ;copy of k
+    pxor      m4,    m1                    ;i^j
+    pxor      m5,    m3                    ;k^l
+    por       m4,    m5                    ;(i^j)|(k^l)
+    pavgw     m0,    m1                    ;s = (i+j+1)>>1
+    pavgw     m2,    m3                    ;t = (k+l+1)>>1
+    movu      m5,    m0                    ;copy of s
+    pavgw     m0,    m2                    ;(s+t+1)>>1
+    pxor      m5,    m2                    ;s^t
+    pand      m4,    m5                    ;((i^j)|(k^l))&(s^t)
+    pand      m4,    [hmulw_16p]           ;keep only bit 0 of each word
+    psubw     m0,    m4                    ;rounding fix: result = (i+j+k+l+2)>>2
+    movu      m1,    [r1 + 112]             ;i = row0 words 56-63
+    movu      m2,    [r1 + 112 + 2]         ;j = row0 words 57-64; NOTE(review): reads 2 bytes past the 64-word row — assumes padded source, confirm
+    movu      m3,    [r1 + r2 + 112]        ;k = row1 words 56-63
+    movu      m4,    [r1 + r2 + 112 + 2]    ;l = row1 words 57-64
+    movu      m5,    m1                    ;copy of i
+    movu      m6,    m3                    ;copy of k
+    pxor      m5,    m2                    ;i^j
+    pxor      m6,    m4                    ;k^l
+    por       m5,    m6                    ;(i^j)|(k^l)
+    pavgw     m1,    m2                    ;s = (i+j+1)>>1
+    pavgw     m3,    m4                    ;t = (k+l+1)>>1
+    movu      m6,    m1                    ;copy of s
+    pavgw     m1,    m3                    ;(s+t+1)>>1
+    pxor      m6,    m3                    ;s^t
+    pand      m5,    m6                    ;((i^j)|(k^l))&(s^t)
+    pand      m5,    [hmulw_16p]           ;keep only bit 0 of each word
+    psubw     m1,    m5                    ;rounding fix: result = (i+j+k+l+2)>>2
+    pshufb    m0,    m0,    m7             ;even words of m0 -> low qword
+    pshufb    m1,    m1,    m7             ;even words of m1 -> low qword
+
+    punpcklqdq    m0,           m1         ;join the two 8-word halves
+    movu          [r0 + 48],    m0         ;store dest words 24-31
+    lea    r0,    [r0 + 64]                ;advance dest one row (32 words)
+%else
     mova        m7,      [deinterleave_shuf]
-    mov         r3d,     32
 .loop
 
     movu        m0,      [r1]                  ;i
@@ -1895,9 +2061,9 @@
     movu          [r0 + 16],    m0
 
     lea    r0,    [r0 + 32]
+%endif
     lea    r1,    [r1 + 2 * r2]
     dec    r3d
-
     jnz    .loop
 
 RET


More information about the x265-devel mailing list