[x265] [PATCH Review only] asm: code for pixel_var_16xN
murugan at multicorewareinc.com
murugan at multicorewareinc.com
Mon Nov 25 14:58:48 CET 2013
# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1385387913 -19800
# Mon Nov 25 19:28:33 2013 +0530
# Node ID 9e9767a887e3a91c0953b9bfa17c2f34f03ecf11
# Parent deb2fc2dcaf24a86132ebfe0fbaac4859611c92f
asm: code for pixel_var_16xN
diff -r deb2fc2dcaf2 -r 9e9767a887e3 source/common/pixel.cpp
--- a/source/common/pixel.cpp Mon Nov 25 19:07:38 2013 +0530
+++ b/source/common/pixel.cpp Mon Nov 25 19:28:33 2013 +0530
@@ -972,6 +972,12 @@
p.var[LUMA_8x8] = pixel_var<8, 8>;
p.var[LUMA_8x16] = pixel_var<8, 16>;
p.var[LUMA_8x32] = pixel_var<8, 32>;
+ p.var[LUMA_16x4] = pixel_var<16, 4>; // 16-wide entries of the var table; heights mirror the pixel_var_16xN asm routines added by this patch
+ p.var[LUMA_16x8] = pixel_var<16, 8>;
+ p.var[LUMA_16x12] = pixel_var<16, 12>;
+ p.var[LUMA_16x16] = pixel_var<16, 16>;
+ p.var[LUMA_16x32] = pixel_var<16, 32>;
+ p.var[LUMA_16x64] = pixel_var<16, 64>; // tallest 16-wide size registered here
p.plane_copy_deinterleave_c = plane_copy_deinterleave_chroma;
}
diff -r deb2fc2dcaf2 -r 9e9767a887e3 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Nov 25 19:07:38 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Nov 25 19:28:33 2013 +0530
@@ -419,7 +419,13 @@
SETUP_PIXEL_VAR_DEF(8, 4, cpu); \
SETUP_PIXEL_VAR_DEF(8, 8, cpu); \
SETUP_PIXEL_VAR_DEF(8, 16, cpu); \
- SETUP_PIXEL_VAR_DEF(8, 32, cpu);
+ SETUP_PIXEL_VAR_DEF(8, 32, cpu); \
+ SETUP_PIXEL_VAR_DEF(16, 4, cpu); /* new 16xN entries; one per pixel_var_16xN routine added in pixel-a.asm by this patch */ \
+ SETUP_PIXEL_VAR_DEF(16, 8, cpu); \
+ SETUP_PIXEL_VAR_DEF(16, 12, cpu); \
+ SETUP_PIXEL_VAR_DEF(16, 16, cpu); \
+ SETUP_PIXEL_VAR_DEF(16, 32, cpu); \
+ SETUP_PIXEL_VAR_DEF(16, 64, cpu); /* no trailing backslash: the continued definition ends on this line */
namespace x265 {
// private x265 namespace
diff -r deb2fc2dcaf2 -r 9e9767a887e3 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Mon Nov 25 19:07:38 2013 +0530
+++ b/source/common/x86/pixel-a.asm Mon Nov 25 19:28:33 2013 +0530
@@ -1401,18 +1401,201 @@
jnz .loop
VAR_END 8, 32
+cglobal pixel_var_16x4, 2,3,8 ; 16x4 variant, fully unrolled: 2 row pairs = 4 rows (x86inc convention: 2 args, 3 GPRs, 8 vector regs)
+ VAR_START 1 ; macro defined outside this hunk; presumably initialises the accumulators - TODO confirm
+ lea r2, [r1 * 3] ; r2 = 3 * r1, so [r0 + r2] is the 4th row when r1 is the row pitch
+ mova m0, [r0] ; row 0 (mova: assumes the row is 16-byte aligned - TODO confirm)
+ mova m3, [r0 + r1] ; row 1
+ DEINTB 1, 0, 4, 3, 7 ; macro not visible here; presumably splits the two loaded rows into m0/m1 and m3/m4 using m7
+ VAR_CORE ; macro not visible here; presumably accumulates this row pair
+ mova m0, [r0 + 2 * r1] ; row 2
+ mova m3, [r0 + r2] ; row 3
+ DEINTB 1, 0, 4, 3, 7
+ VAR_CORE
+ VAR_END 16, 4 ; epilogue macro, given the block width and height
+
+cglobal pixel_var_16x8, 2,3,8 ; 16x8 variant, fully unrolled: 4 row pairs = 8 rows
+ VAR_START 1 ; macro defined outside this hunk; presumably initialises the accumulators - TODO confirm
+ lea r2, [r1 * 3] ; r2 = 3 * r1, offset of the 4th row in each group of four
+ mova m0, [r0] ; rows 0-1
+ mova m3, [r0 + r1]
+ DEINTB 1, 0, 4, 3, 7
+ VAR_CORE
+ mova m0, [r0 + 2 * r1] ; rows 2-3
+ mova m3, [r0 + r2]
+ DEINTB 1, 0, 4, 3, 7
+ lea r0, [r0 + r1 * 4] ; advance four rows; both loads for this pair are already done
+ VAR_CORE
+ mova m0, [r0] ; rows 4-5
+ mova m3, [r0 + r1]
+ DEINTB 1, 0, 4, 3, 7
+ VAR_CORE
+ mova m0, [r0 + 2 * r1] ; rows 6-7
+ mova m3, [r0 + r2]
+ DEINTB 1, 0, 4, 3, 7
+ VAR_CORE
+ VAR_END 16, 8 ; epilogue macro, given the block width and height
+
+cglobal pixel_var_16x12, 2,3,8 ; 16x12 variant, fully unrolled: 6 row pairs = 12 rows
+ VAR_START 1 ; macro defined outside this hunk; presumably initialises the accumulators - TODO confirm
+ lea r2, [r1 * 3] ; r2 = 3 * r1, offset of the 4th row in each group of four
+ mova m0, [r0] ; rows 0-1
+ mova m3, [r0 + r1]
+ DEINTB 1, 0, 4, 3, 7
+ VAR_CORE
+ mova m0, [r0 + 2 * r1] ; rows 2-3
+ mova m3, [r0 + r2]
+ DEINTB 1, 0, 4, 3, 7
+ lea r0, [r0 + r1 * 4] ; advance four rows; both loads for this pair are already done
+ VAR_CORE
+ mova m0, [r0] ; rows 4-5
+ mova m3, [r0 + r1]
+ DEINTB 1, 0, 4, 3, 7
+ VAR_CORE
+ mova m0, [r0 + 2 * r1] ; rows 6-7
+ mova m3, [r0 + r2]
+ DEINTB 1, 0, 4, 3, 7
+ lea r0, [r0 + r1 * 4] ; advance to rows 8-11
+ VAR_CORE
+ mova m0, [r0] ; rows 8-9
+ mova m3, [r0 + r1]
+ DEINTB 1, 0, 4, 3, 7
+ VAR_CORE
+ mova m0, [r0 + 2 * r1] ; rows 10-11
+ mova m3, [r0 + r2]
+ DEINTB 1, 0, 4, 3, 7
+ VAR_CORE
+ VAR_END 16, 12 ; epilogue macro, given the block width and height
+
cglobal pixel_var_16x16, 2,3,8
VAR_START 1
- mov r2d, 8
+ lea r2, [r1 * 3] ; r2 now holds 3 * r1 (4th-row offset) instead of a loop counter: the body below is fully unrolled
+ mova m0, [r0] ; rows 0-1
+ mova m3, [r0 + r1]
+ DEINTB 1, 0, 4, 3, 7
+ VAR_CORE
+ mova m0, [r0 + 2 * r1] ; rows 2-3
+ mova m3, [r0 + r2]
+ DEINTB 1, 0, 4, 3, 7
+ lea r0, [r0 + r1 * 4] ; advance four rows; both loads for this pair are already done
+ VAR_CORE
+ mova m0, [r0] ; rows 4-5
+ mova m3, [r0 + r1]
+ DEINTB 1, 0, 4, 3, 7
+ VAR_CORE
+ mova m0, [r0 + 2 * r1] ; rows 6-7
+ mova m3, [r0 + r2]
+ DEINTB 1, 0, 4, 3, 7
+ lea r0, [r0 + r1 * 4] ; advance to rows 8-11
+ VAR_CORE
+ mova m0, [r0] ; rows 8-9
+ mova m3, [r0 + r1]
+ DEINTB 1, 0, 4, 3, 7
+ VAR_CORE
+ mova m0, [r0 + 2 * r1] ; rows 10-11
+ mova m3, [r0 + r2]
+ DEINTB 1, 0, 4, 3, 7
+ lea r0, [r0 + r1 * 4] ; advance to rows 12-15
+ VAR_CORE
+ mova m0, [r0] ; rows 12-13
+ mova m3, [r0 + r1]
+ DEINTB 1, 0, 4, 3, 7
+ VAR_CORE
+ mova m0, [r0 + 2 * r1] ; rows 14-15
+ mova m3, [r0 + r2]
+ DEINTB 1, 0, 4, 3, 7
+ VAR_CORE
+ VAR_END 16, 16 ; epilogue macro, given the block width and height
+
+cglobal pixel_var_16x32, 2,4,8 ; 16x32 variant: 2 loop iterations x 8 row pairs = 32 rows (one more GPR than the unrolled variants, for the counter)
+ VAR_START 1 ; macro defined outside this hunk; presumably initialises the accumulators - TODO confirm
+ mov r2d, 2 ; loop counter: each pass of .loop consumes 16 rows
+ lea r3, [r1 * 3] ; r3 = 3 * r1, offset of the 4th row in each group of four
.loop:
- mova m0, [r0]
- mova m3, [r0+r1]
+ mova m0, [r0] ; rows 0-1 of this pass
+ mova m3, [r0 + r1]
DEINTB 1, 0, 4, 3, 7
- lea r0, [r0+r1*2]
VAR_CORE
- dec r2d
- jg .loop
- VAR_END 16, 16
+ mova m0, [r0 + 2 * r1] ; rows 2-3
+ mova m3, [r0 + r3]
+ DEINTB 1, 0, 4, 3, 7
+ lea r0, [r0 + r1 * 4] ; advance four rows; both loads for this pair are already done
+ VAR_CORE
+ mova m0, [r0] ; rows 4-5
+ mova m3, [r0 + r1]
+ DEINTB 1, 0, 4, 3, 7
+ VAR_CORE
+ mova m0, [r0 + 2 * r1] ; rows 6-7
+ mova m3, [r0 + r3]
+ DEINTB 1, 0, 4, 3, 7
+ lea r0, [r0 + r1 * 4] ; advance to rows 8-11
+ VAR_CORE
+ mova m0, [r0] ; rows 8-9
+ mova m3, [r0 + r1]
+ DEINTB 1, 0, 4, 3, 7
+ VAR_CORE
+ mova m0, [r0 + 2 * r1] ; rows 10-11
+ mova m3, [r0 + r3]
+ DEINTB 1, 0, 4, 3, 7
+ lea r0, [r0 + r1 * 4] ; advance to rows 12-15
+ VAR_CORE
+ mova m0, [r0] ; rows 12-13
+ mova m3, [r0 + r1]
+ DEINTB 1, 0, 4, 3, 7
+ VAR_CORE
+ mova m0, [r0 + 2 * r1] ; rows 14-15
+ mova m3, [r0 + r3]
+ DEINTB 1, 0, 4, 3, 7
+ lea r0, [r0 + r1 * 4] ; r0 now points at the first row of the next 16-row pass
+ VAR_CORE
+ dec r2d ; one 16-row pass done
+ jg .loop ; repeat while the counter is still positive
+ VAR_END 16, 32 ; epilogue macro, given the block width and height
+
+cglobal pixel_var_16x64, 2,4,8 ; 16x64 variant: 4 loop iterations x 8 row pairs = 64 rows
+ VAR_START 1 ; macro defined outside this hunk; presumably initialises the accumulators - TODO confirm
+ mov r2d, 4 ; loop counter: each pass of .loop consumes 16 rows
+ lea r3, [r1 * 3] ; r3 = 3 * r1, offset of the 4th row in each group of four
+.loop:
+ mova m0, [r0] ; rows 0-1 of this pass
+ mova m3, [r0 + r1]
+ DEINTB 1, 0, 4, 3, 7
+ VAR_CORE
+ mova m0, [r0 + 2 * r1] ; rows 2-3
+ mova m3, [r0 + r3]
+ DEINTB 1, 0, 4, 3, 7
+ lea r0, [r0 + r1 * 4] ; advance four rows; both loads for this pair are already done
+ VAR_CORE
+ mova m0, [r0] ; rows 4-5
+ mova m3, [r0 + r1]
+ DEINTB 1, 0, 4, 3, 7
+ VAR_CORE
+ mova m0, [r0 + 2 * r1] ; rows 6-7
+ mova m3, [r0 + r3]
+ DEINTB 1, 0, 4, 3, 7
+ lea r0, [r0 + r1 * 4] ; advance to rows 8-11
+ VAR_CORE
+ mova m0, [r0] ; rows 8-9
+ mova m3, [r0 + r1]
+ DEINTB 1, 0, 4, 3, 7
+ VAR_CORE
+ mova m0, [r0 + 2 * r1] ; rows 10-11
+ mova m3, [r0 + r3]
+ DEINTB 1, 0, 4, 3, 7
+ lea r0, [r0 + r1 * 4] ; advance to rows 12-15
+ VAR_CORE
+ mova m0, [r0] ; rows 12-13
+ mova m3, [r0 + r1]
+ DEINTB 1, 0, 4, 3, 7
+ VAR_CORE
+ mova m0, [r0 + 2 * r1] ; rows 14-15
+ mova m3, [r0 + r3]
+ DEINTB 1, 0, 4, 3, 7
+ lea r0, [r0 + r1 * 4] ; r0 now points at the first row of the next 16-row pass
+ VAR_CORE
+ dec r2d ; one 16-row pass done
+ jg .loop ; repeat while the counter is still positive
+ VAR_END 16, 64 ; epilogue macro, given the block width and height
%endmacro ; VAR
INIT_XMM sse2
diff -r deb2fc2dcaf2 -r 9e9767a887e3 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Mon Nov 25 19:07:38 2013 +0530
+++ b/source/common/x86/pixel.h Mon Nov 25 19:28:33 2013 +0530
@@ -354,7 +354,13 @@
SETUP_LUMA_PIXELVAR_FUNC(8, 4, cpu); \
SETUP_LUMA_PIXELVAR_FUNC(8, 8, cpu); \
SETUP_LUMA_PIXELVAR_FUNC(8, 16, cpu); \
- SETUP_LUMA_PIXELVAR_FUNC(8, 32, cpu);
+ SETUP_LUMA_PIXELVAR_FUNC(8, 32, cpu); \
+ SETUP_LUMA_PIXELVAR_FUNC(16, 4, cpu); /* 16xN entries: one per pixel_var_16xN routine added in pixel-a.asm by this patch */ \
+ SETUP_LUMA_PIXELVAR_FUNC(16, 8, cpu); \
+ SETUP_LUMA_PIXELVAR_FUNC(16, 12, cpu); \
+ SETUP_LUMA_PIXELVAR_FUNC(16, 16, cpu); \
+ SETUP_LUMA_PIXELVAR_FUNC(16, 32, cpu); \
+ SETUP_LUMA_PIXELVAR_FUNC(16, 64, cpu); /* 16x64, matching pixel_var_16x64 in pixel-a.asm and SETUP_PIXEL_VAR_DEF(16, 64, cpu); this patch defines no 16x48 routine */
LUMA_PIXELVAR_DEF(_sse2);
More information about the x265-devel
mailing list