[x265] [PATCH Review only] 10bpp: asm code for pixel_var_32x32 and 64x64

murugan at multicorewareinc.com
Fri Nov 29 14:33:36 CET 2013


# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1385732005 -19800
#      Fri Nov 29 19:03:25 2013 +0530
# Node ID 5049b77614bf1210909c448f201815e1b662d4b9
# Parent  e7a5780843de80575c5489b9827587ef43ab07af
10bpp: asm code for pixel_var_32x32 and 64x64
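
For reference while reviewing: the var primitive returns the block's pixel sum and sum of squared pixels (used, for example, when deriving block variance for adaptive quantization). Below is a minimal C sketch of the behaviour the new routines are expected to match, assuming the usual x264-style packing of the pixel sum in the low 32 bits and the sum of squares in the high 32 bits of the return value; the type and function names are illustrative only, not the actual x265 C primitives:

    #include <stdint.h>

    typedef uint16_t pixel;  /* 10-bit samples stored in 16-bit words */

    /* Illustrative reference only -- not the actual x265 C primitive.
     * At 10bpp both accumulators fit in 32 bits up to 64x64:
     * 64*64*1023^2 is just under 2^32. */
    static uint64_t var_ref(const pixel *pix, intptr_t stride, int size)
    {
        uint32_t sum = 0, sqr = 0;
        for (int y = 0; y < size; y++)
        {
            for (int x = 0; x < size; x++)
            {
                sum += pix[x];
                sqr += pix[x] * pix[x];
            }
            pix += stride;
        }
        return sum + ((uint64_t)sqr << 32);
    }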

diff -r e7a5780843de -r 5049b77614bf source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Nov 28 23:30:16 2013 -0600
+++ b/source/common/x86/asm-primitives.cpp	Fri Nov 29 19:03:25 2013 +0530
@@ -457,6 +457,7 @@
 {
 #if HIGH_BIT_DEPTH
     if (cpuMask & X265_CPU_SSE2) p.sa8d[0] = p.sa8d[0];
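+    // enable the SSE2 var primitives for 10bpp builds (now covering 32x32 and 64x64 as well)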
+    LUMA_VAR(_sse2);
 #else
     if (cpuMask & X265_CPU_SSE2)
     {
diff -r e7a5780843de -r 5049b77614bf source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Thu Nov 28 23:30:16 2013 -0600
+++ b/source/common/x86/pixel-a.asm	Fri Nov 29 19:03:25 2013 +0530
@@ -2327,8 +2327,14 @@
 %if mmsize == 8 && %1*%2 == 256
     HADDUW  m5, m2
 %else
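+; for widths >= 32 the callers flush partial word sums into r4d between
+; strips; fold that running total back into the final pixel sum here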
+%if %1 >= 32
+    HADDW     m5,    m2
+    movd      m7,    r4d
+    paddd     m5,    m7
+%else
     HADDW   m5, m2
 %endif
+%endif
 %else ; !HIGH_BIT_DEPTH
 %if %1 == 64
     HADDW     m5,    m2
@@ -2364,14 +2370,14 @@
     paddd     m6, m4
 %endmacro
 
-%macro VAR_2ROW 2
+%macro VAR_2ROW 3
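+; %3: unique suffix for the .loop label, so the macro can be expanded more
+; than once inside a single function without label clashes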
     mov      r2d, %2
-.loop:
+.loop%3:
 %if HIGH_BIT_DEPTH
-    mova      m0, [r0]
-    mova      m1, [r0+mmsize]
-    mova      m3, [r0+%1]
-    mova      m4, [r0+%1+mmsize]
+    movu      m0, [r0]
+    movu      m1, [r0+mmsize]
+    movu      m3, [r0+%1]
+    movu      m4, [r0+%1+mmsize]
 %else ; !HIGH_BIT_DEPTH
     mova      m0, [r0]
     punpckhbw m1, m0, m7
@@ -2390,7 +2396,7 @@
 %endif ; !HIGH_BIT_DEPTH
     VAR_CORE
     dec r2d
-    jg .loop
+    jg .loop%3
 %endmacro
 
 ;-----------------------------------------------------------------------------
@@ -2400,13 +2406,13 @@
 cglobal pixel_var_16x16, 2,3
     FIX_STRIDES r1
     VAR_START 0
-    VAR_2ROW 8*SIZEOF_PIXEL, 16
+    VAR_2ROW 8*SIZEOF_PIXEL, 16, 1
     VAR_END 16, 16
 
 cglobal pixel_var_8x8, 2,3
     FIX_STRIDES r1
     VAR_START 0
-    VAR_2ROW r1, 4
+    VAR_2ROW r1, 4, 1
     VAR_END 8, 8
 
 %if HIGH_BIT_DEPTH
@@ -2414,24 +2420,130 @@
 cglobal pixel_var_16x16, 2,3,8
     FIX_STRIDES r1
     VAR_START 0
-    VAR_2ROW r1, 8
+    VAR_2ROW r1, 8, 1
     VAR_END 16, 16
 
 cglobal pixel_var_8x8, 2,3,8
     lea       r2, [r1*3]
     VAR_START 0
-    mova      m0, [r0]
-    mova      m1, [r0+r1*2]
-    mova      m3, [r0+r1*4]
-    mova      m4, [r0+r2*2]
+    movu      m0, [r0]
+    movu      m1, [r0+r1*2]
+    movu      m3, [r0+r1*4]
+    movu      m4, [r0+r2*2]
     lea       r0, [r0+r1*8]
     VAR_CORE
-    mova      m0, [r0]
-    mova      m1, [r0+r1*2]
-    mova      m3, [r0+r1*4]
-    mova      m4, [r0+r2*2]
+    movu      m0, [r0]
+    movu      m1, [r0+r1*2]
+    movu      m3, [r0+r1*4]
+    movu      m4, [r0+r2*2]
     VAR_CORE
-    VAR_END 8, 8
+    VAR_END 8, 8
+
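+; at HIGH_BIT_DEPTH the pixel sum is accumulated as 16-bit words in m5, which
+; would overflow for blocks larger than 16x16.  32x32 and 64x64 are therefore
+; processed as 16x16 strips: after each strip the word sums are reduced with
+; HADDW and accumulated in r4, m5 is cleared, and VAR_END folds r4 back in.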
+cglobal pixel_var_32x32, 2,6,8
+    FIX_STRIDES r1
+    mov       r3,    r0
+    VAR_START 0
+    VAR_2ROW  r1,    8, 1
+    HADDW     m5,    m2
+    movd      r4d,   m5
+    pxor      m5,    m5
+    VAR_2ROW  r1,    8, 2
+    HADDW     m5,    m2
+    movd      r5d,   m5
+    add       r4,    r5
+    pxor      m5,    m5
+    lea       r0,    [r3 + 32]
+    VAR_2ROW  r1,    8, 3
+    HADDW     m5,    m2
+    movd      r5d,   m5
+    add       r4,    r5
+    pxor      m5,    m5
+    VAR_2ROW  r1,    8, 4
+    VAR_END   32,    32
+
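+; same scheme as 32x32: four 16-pixel-wide column strips (r3 + 0/32/64/96
+; bytes), each walked as four 16-row VAR_2ROW passes, flushing the word sums
+; to r4 after every strip except the last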
+cglobal pixel_var_64x64, 2,6,8
+    FIX_STRIDES r1
+    mov       r3,    r0
+    VAR_START 0
+    VAR_2ROW  r1,    8, 1
+    HADDW     m5,    m2
+    movd      r4d,   m5
+    pxor      m5,    m5
+    VAR_2ROW  r1,    8, 2
+    HADDW     m5,    m2
+    movd      r5d,   m5
+    add       r4,    r5
+    pxor      m5,    m5
+    VAR_2ROW  r1,    8, 3
+    HADDW     m5,    m2
+    movd      r5d,   m5
+    add       r4,    r5
+    pxor      m5,    m5
+    VAR_2ROW  r1,    8, 4
+    HADDW     m5,    m2
+    movd      r5d,   m5
+    add       r4,    r5
+    pxor      m5,    m5
+    lea       r0,    [r3 + 32]
+    VAR_2ROW  r1,    8, 5
+    HADDW     m5,    m2
+    movd      r5d,   m5
+    add       r4,    r5
+    pxor      m5,    m5
+    VAR_2ROW  r1,    8, 6
+    HADDW     m5,    m2
+    movd      r5d,   m5
+    add       r4,    r5
+    pxor      m5,    m5
+    VAR_2ROW  r1,    8, 7
+    HADDW     m5,    m2
+    movd      r5d,   m5
+    add       r4,    r5
+    pxor      m5,    m5
+    VAR_2ROW  r1,    8, 8
+    HADDW     m5,    m2
+    movd      r5d,   m5
+    add       r4,    r5
+    pxor      m5,    m5
+    lea       r0,    [r3 + 64]
+    VAR_2ROW  r1,    8, 9
+    HADDW     m5,    m2
+    movd      r5d,   m5
+    add       r4,    r5
+    pxor      m5,    m5
+    VAR_2ROW  r1,    8, 10
+    HADDW     m5,    m2
+    movd      r5d,   m5
+    add       r4,    r5
+    pxor      m5,    m5
+    VAR_2ROW  r1,    8, 11
+    HADDW     m5,    m2
+    movd      r5d,   m5
+    add       r4,    r5
+    pxor      m5,    m5
+    VAR_2ROW  r1,    8, 12
+    HADDW     m5,    m2
+    movd      r5d,   m5
+    add       r4,    r5
+    pxor      m5,    m5
+    lea       r0,    [r3 + 96]
+    VAR_2ROW  r1,    8, 13
+    HADDW     m5,    m2
+    movd      r5d,   m5
+    add       r4,    r5
+    pxor      m5,    m5
+    VAR_2ROW  r1,    8, 14
+    HADDW     m5,    m2
+    movd      r5d,   m5
+    add       r4,    r5
+    pxor      m5,    m5
+    VAR_2ROW  r1,    8, 15
+    HADDW     m5,    m2
+    movd      r5d,   m5
+    add       r4,    r5
+    pxor      m5,    m5
+    VAR_2ROW  r1,    8, 16
+    VAR_END   64,    64
 %endmacro ; VAR
 
 INIT_XMM sse2

