[x265] [PATCH] asm: 10bpp code for scale1D_128to64 module

murugan at multicorewareinc.com murugan at multicorewareinc.com
Wed Dec 4 14:15:35 CET 2013


# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1386162865 -19800
#      Wed Dec 04 18:44:25 2013 +0530
# Node ID a525d3fde24f8c076def1b67122e6f7f69e60d35
# Parent  4347192eae502a5f963d7e79655ba753e677b58b
asm: 10bpp code for scale1D_128to64 module

diff -r 4347192eae50 -r a525d3fde24f source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Dec 04 17:11:43 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Dec 04 18:44:25 2013 +0530
@@ -576,6 +576,7 @@
     }
     if (cpuMask & X265_CPU_SSSE3)
     {
+        p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
     }
     if (cpuMask & X265_CPU_SSE4)
     {
diff -r 4347192eae50 -r a525d3fde24f source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Wed Dec 04 17:11:43 2013 +0530
+++ b/source/common/x86/pixel-util8.asm	Wed Dec 04 18:44:25 2013 +0530
@@ -44,6 +44,7 @@
 mask_ff:   times 16 db 0xff
            times 16 db 0
 deinterleave_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+deinterleave_word_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 ; pshufb control: even-indexed words -> low qword, odd-indexed words -> high qword
 hmul_16p:  times 16 db 1
            times 8 db 1, -1
 
@@ -1643,7 +1644,98 @@
 ;-----------------------------------------------------------------
 INIT_XMM ssse3
 cglobal scale1D_128to64, 2, 2, 8, dest, src1, stride
-
+%if HIGH_BIT_DEPTH
+    mova        m7,      [deinterleave_word_shuf]    ; pshufb control; its first 8 bytes gather words 0,2,4,6 into the low qword
+    ; Pass 1 of 4: src bytes 0-63 (32 words) -> dest bytes 0-31 (16 words); each output word = pavgw of one adjacent word pair
+    movu        m0,      [r1]                        ; m0 = src words 0-7 (unaligned load)
+    palignr     m1,      m0,    2                    ; m1 words 0-6 = m0 words 1-7; top word comes from the stale m1 and is discarded later
+    movu        m2,      [r1 + 16]                   ; m2 = src words 8-15
+    palignr     m3,      m2,    2                    ; m3 = m2 shifted down by one word (top word stale)
+    movu        m4,      [r1 + 32]                   ; m4 = src words 16-23
+    palignr     m5,      m4,    2                    ; m5 = m4 shifted down by one word (top word stale)
+    movu        m6,      [r1 + 48]                   ; m6 = src words 24-31
+    pavgw       m0,      m1                          ; word i = (w[i] + w[i+1] + 1) >> 1; only even i are kept
+    palignr     m1,      m6,    2                    ; m1 is free again after the pavgw above: reuse it for m6's shifted copy
+    pavgw       m2,      m3
+    pavgw       m4,      m5
+    pavgw       m6,      m1
+    pshufb      m0,      m0,    m7                   ; low qword = averages at even word positions 0,2,4,6 (3-operand form; presumably lowered by x86inc - confirm)
+    pshufb      m2,      m2,    m7
+    pshufb      m4,      m4,    m7
+    pshufb      m6,      m6,    m7
+    punpcklqdq    m0,           m2                   ; m0 = low qword of m0 : low qword of m2 -> 8 output words
+    movu          [r0],         m0                   ; store dest words 0-7
+    punpcklqdq    m4,           m6                   ; m4 = low qword of m4 : low qword of m6 -> 8 output words
+    movu          [r0 + 16],    m4                   ; store dest words 8-15
+
+
+    ; Pass 2 of 4: same sequence for src bytes 64-127 -> dest bytes 32-63
+    movu        m0,      [r1 + 64]
+    palignr     m1,      m0,    2
+    movu        m2,      [r1 + 80]
+    palignr     m3,      m2,    2
+    movu        m4,      [r1 + 96]
+    palignr     m5,      m4,    2
+    movu        m6,      [r1 + 112]
+    pavgw       m0,      m1
+    palignr     m1,      m6,    2                    ; reuse m1 once its first shifted copy has been consumed
+    pavgw       m2,      m3
+    pavgw       m4,      m5
+    pavgw       m6,      m1
+    pshufb      m0,      m0,    m7
+    pshufb      m2,      m2,    m7
+    pshufb      m4,      m4,    m7
+    pshufb      m6,      m6,    m7
+    punpcklqdq    m0,           m2
+    movu          [r0 + 32],    m0
+    punpcklqdq    m4,           m6
+    movu          [r0 + 48],    m4
+    ; Pass 3 of 4: same sequence for src bytes 128-191 -> dest bytes 64-95
+    movu        m0,      [r1 + 128]
+    palignr     m1,      m0,    2
+    movu        m2,      [r1 + 144]
+    palignr     m3,      m2,    2
+    movu        m4,      [r1 + 160]
+    palignr     m5,      m4,    2
+    movu        m6,      [r1 + 176]
+    pavgw       m0,      m1
+    palignr     m1,      m6,    2                    ; reuse m1 once its first shifted copy has been consumed
+    pavgw       m2,      m3
+    pavgw       m4,      m5
+    pavgw       m6,      m1
+    pshufb      m0,      m0,    m7
+    pshufb      m2,      m2,    m7
+    pshufb      m4,      m4,    m7
+    pshufb      m6,      m6,    m7
+
+    punpcklqdq    m0,           m2
+    movu          [r0 + 64],    m0
+    punpcklqdq    m4,           m6
+    movu          [r0 + 80],    m4
+    ; Pass 4 of 4: same sequence for src bytes 192-255 -> dest bytes 96-127
+    movu        m0,      [r1 + 192]
+    palignr     m1,      m0,    2
+    movu        m2,      [r1 + 208]
+    palignr     m3,      m2,    2
+    movu        m4,      [r1 + 224]
+    palignr     m5,      m4,    2
+    movu        m6,      [r1 + 240]
+    pavgw       m0,      m1
+    palignr     m1,      m6,    2                    ; reuse m1 once its first shifted copy has been consumed
+    pavgw       m2,      m3
+    pavgw       m4,      m5
+    pavgw       m6,      m1
+    pshufb      m0,      m0,    m7
+    pshufb      m2,      m2,    m7
+    pshufb      m4,      m4,    m7
+    pshufb      m6,      m6,    m7
+
+    punpcklqdq    m0,           m2
+    movu          [r0 + 96],    m0
+    punpcklqdq    m4,           m6
+    movu          [r0 + 112],    m4                  ; last store: 128 output bytes (64 words) written in total
+
+%else
     mova        m7,      [deinterleave_shuf]
 
     movu        m0,      [r1]
@@ -1697,7 +1789,7 @@
     movu          [r0 + 32],    m0
     punpcklqdq    m4,           m6
     movu          [r0 + 48],    m4
-
+%endif
 RET
 
 ;-----------------------------------------------------------------


More information about the x265-devel mailing list