[x265] [PATCH] asm: code for pixel_var_32x32 and 64x64 blocks

murugan at multicorewareinc.com murugan at multicorewareinc.com
Wed Nov 27 10:49:49 CET 2013


# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1385545753 -19800
#      Wed Nov 27 15:19:13 2013 +0530
# Branch stable
# Node ID d770e8e65dc41c224cdea78efd588c5b2155c606
# Parent  417f794274e5692851b558eaa609e6fbdac1d50f
asm: code for pixel_var_32x32 and 64x64 blocks

diff -r 417f794274e5 -r d770e8e65dc4 source/common/pixel.cpp
--- a/source/common/pixel.cpp	Wed Nov 27 01:49:09 2013 -0600
+++ b/source/common/pixel.cpp	Wed Nov 27 15:19:13 2013 +0530
@@ -985,6 +985,8 @@
 
     p.var[BLOCK_8x8] = pixel_var<8>;
     p.var[BLOCK_16x16] = pixel_var<16>;
+    p.var[BLOCK_32x32] = pixel_var<32>;
+    p.var[BLOCK_64x64] = pixel_var<64>;
     p.plane_copy_deinterleave_c = plane_copy_deinterleave_chroma;
 }
 }
diff -r 417f794274e5 -r d770e8e65dc4 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Nov 27 01:49:09 2013 -0600
+++ b/source/common/x86/asm-primitives.cpp	Wed Nov 27 15:19:13 2013 +0530
@@ -440,7 +440,9 @@
 
 #define LUMA_VAR(cpu) \
     SETUP_PIXEL_VAR_DEF(8,   8, cpu); \
-    SETUP_PIXEL_VAR_DEF(16, 16, cpu);
+    SETUP_PIXEL_VAR_DEF(16, 16, cpu); \
+    SETUP_PIXEL_VAR_DEF(32, 32, cpu); \
+    SETUP_PIXEL_VAR_DEF(64, 64, cpu);
 
 namespace x265 {
 // private x265 namespace
diff -r 417f794274e5 -r d770e8e65dc4 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Wed Nov 27 01:49:09 2013 -0600
+++ b/source/common/x86/pixel-a.asm	Wed Nov 27 15:19:13 2013 +0530
@@ -1612,7 +1612,13 @@
     HADDW   m5, m2
 %endif
 %else ; !HIGH_BIT_DEPTH
+%if %1 == 64
+    HADDW     m5,    m2
+    movd      m7,    r4d
+    paddd     m5,    m7
+%else
     HADDW   m5, m2
+%endif
 %endif ; HIGH_BIT_DEPTH
     HADDD   m6, m1
 %if ARCH_X86_64
@@ -1738,9 +1744,7 @@
     VAR_CORE
     VAR_END 8, 8
 
-cglobal pixel_var_16x16, 2,3,8
-    VAR_START 1
-    lea       r2,    [r1 * 3]
+cglobal pixel_var_16x16_internal
     mova      m0,    [r0]
     mova      m3,    [r0 + r1]
     DEINTB    1, 0, 4, 3, 7
@@ -1776,7 +1780,74 @@
     mova      m3,    [r0 + r2]
     DEINTB    1, 0, 4, 3, 7
     VAR_CORE
+    ret
+
+cglobal pixel_var_16x16, 2,3,8 ; 16x16 entry point; the block body now lives in pixel_var_16x16_internal
+    VAR_START 1 ; set up accumulators (macro defined outside this hunk)
+    lea     r2,    [r1 * 3] ; r2 = 3 * r1, used by the helper for its [r0 + r2] loads (r1 presumably the row stride - confirm)
+    call    pixel_var_16x16_internal ; run the shared 16x16 accumulation body
     VAR_END 16, 16 ; reduce the accumulators (HADDW m5 / HADDD m6 in VAR_END) and return
+
+cglobal pixel_var_32x32, 2,4,8 ; 32x32 variant built from four calls to the 16x16 helper
+    VAR_START 1 ; set up accumulators (macro defined outside this hunk)
+    lea     r2,    [r1 * 3] ; r2 = 3 * r1, used by the helper for its [r0 + r2] loads
+    mov     r3,    r0 ; keep the original r0 so the second column can be addressed from it
+    call    pixel_var_16x16_internal ; first column, first 16x16 block
+    lea       r0,    [r0 + r1 * 4] ; r0 += 4 * r1; NOTE(review): assumes the helper leaves r0 4 rows short of the next block - confirm
+    call    pixel_var_16x16_internal ; first column, second 16x16 block
+    lea       r0,    [r3 + 16] ; r0 = saved start + 16 bytes: begin the second column
+    call    pixel_var_16x16_internal ; second column, first 16x16 block
+    lea       r0,    [r0 + r1 * 4] ; r0 += 4 * r1, same step as in the first column
+    call    pixel_var_16x16_internal ; second column, second 16x16 block
+    VAR_END 32, 32 ; reduce the accumulators (HADDW m5 / HADDD m6 in VAR_END) and return
+
+cglobal pixel_var_64x64, 2,6,8 ; 64x64 variant: four 16-byte columns, four helper calls each
+    VAR_START 1 ; set up accumulators (macro defined outside this hunk)
+    lea     r2,    [r1 * 3] ; r2 = 3 * r1, used by the helper for its [r0 + r2] loads
+    mov     r3,    r0 ; keep the original r0; each column is addressed as r3 + offset
+    call    pixel_var_16x16_internal ; column 0 (offset 0), block 1 of 4
+    lea       r0,    [r0 + r1 * 4] ; r0 += 4 * r1; NOTE(review): assumes the helper leaves r0 4 rows short of the next block - confirm
+    call    pixel_var_16x16_internal ; column 0, block 2 of 4
+    lea       r0,    [r0 + r1 * 4] ; r0 += 4 * r1
+    call    pixel_var_16x16_internal ; column 0, block 3 of 4
+    lea       r0,    [r0 + r1 * 4] ; r0 += 4 * r1
+    call    pixel_var_16x16_internal ; column 0, block 4 of 4
+    HADDW     m5,    m2 ; fold m5 after column 0 (m2 presumably scratch; presumably done to keep m5's 16-bit lanes from overflowing - confirm)
+    movd      r4d,   m5 ; r4d = low dword of m5: partial result for column 0
+    pxor      m5,    m5 ; clear m5 for the next column
+    lea       r0,    [r3 + 16] ; r0 = saved start + 16 bytes: begin column 1
+    call    pixel_var_16x16_internal ; column 1, block 1 of 4
+    lea       r0,    [r0 + r1 * 4] ; r0 += 4 * r1
+    call    pixel_var_16x16_internal ; column 1, block 2 of 4
+    lea       r0,    [r0 + r1 * 4] ; r0 += 4 * r1
+    call    pixel_var_16x16_internal ; column 1, block 3 of 4
+    lea       r0,    [r0 + r1 * 4] ; r0 += 4 * r1
+    call    pixel_var_16x16_internal ; column 1, block 4 of 4
+    HADDW     m5,    m2 ; fold m5 after column 1
+    movd      r5d,   m5 ; r5d = low dword of m5: partial result for column 1
+    add       r4,    r5 ; r4 = running total of the flushed partial results
+    pxor      m5,    m5 ; clear m5 for the next column
+    lea       r0,    [r3 + 32] ; r0 = saved start + 32 bytes: begin column 2
+    call    pixel_var_16x16_internal ; column 2, block 1 of 4
+    lea       r0,    [r0 + r1 * 4] ; r0 += 4 * r1
+    call    pixel_var_16x16_internal ; column 2, block 2 of 4
+    lea       r0,    [r0 + r1 * 4] ; r0 += 4 * r1
+    call    pixel_var_16x16_internal ; column 2, block 3 of 4
+    lea       r0,    [r0 + r1 * 4] ; r0 += 4 * r1
+    call    pixel_var_16x16_internal ; column 2, block 4 of 4
+    lea       r0,    [r3 + 48] ; r0 = saved start + 48 bytes: column 3 (set before the flush; independent of m5/r4/r5)
+    HADDW     m5,    m2 ; fold m5 after column 2
+    movd      r5d,   m5 ; r5d = low dword of m5: partial result for column 2
+    add       r4,    r5 ; r4 = total of the partial results for columns 0-2
+    pxor      m5,    m5 ; clear m5 for the last column
+    call    pixel_var_16x16_internal ; column 3, block 1 of 4
+    lea       r0,    [r0 + r1 * 4] ; r0 += 4 * r1
+    call    pixel_var_16x16_internal ; column 3, block 2 of 4
+    lea       r0,    [r0 + r1 * 4] ; r0 += 4 * r1
+    call    pixel_var_16x16_internal ; column 3, block 3 of 4
+    lea       r0,    [r0 + r1 * 4] ; r0 += 4 * r1
+    call    pixel_var_16x16_internal ; column 3, block 4 of 4
+    VAR_END 64, 64 ; the %1 == 64 branch of VAR_END folds m5 and adds r4d back in (movd m7, r4d / paddd m5, m7)
 %endmacro ; VAR
 
 INIT_XMM sse2
diff -r 417f794274e5 -r d770e8e65dc4 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h	Wed Nov 27 01:49:09 2013 -0600
+++ b/source/common/x86/pixel.h	Wed Nov 27 15:19:13 2013 +0530
@@ -357,7 +357,9 @@
 
 #define LUMA_PIXELVAR_DEF(cpu) \
     SETUP_LUMA_PIXELVAR_FUNC(8,   8, cpu); \
-    SETUP_LUMA_PIXELVAR_FUNC(16, 16, cpu);
+    SETUP_LUMA_PIXELVAR_FUNC(16, 16, cpu); \
+    SETUP_LUMA_PIXELVAR_FUNC(32, 32, cpu); \
+    SETUP_LUMA_PIXELVAR_FUNC(64, 64, cpu);
 
 LUMA_PIXELVAR_DEF(_sse2);
 


More information about the x265-devel mailing list