[x265] [PATCH Review only] asm: code for pixel_var_8xN
murugan at multicorewareinc.com
murugan at multicorewareinc.com
Mon Nov 25 14:38:00 CET 2013
# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1385386658 -19800
# Mon Nov 25 19:07:38 2013 +0530
# Node ID deb2fc2dcaf24a86132ebfe0fbaac4859611c92f
# Parent 43da6ca15a61e18d033931ca58940d6794f6f8f8
asm: code for pixel_var_8xN
diff -r 43da6ca15a61 -r deb2fc2dcaf2 source/common/pixel.cpp
--- a/source/common/pixel.cpp Mon Nov 25 18:46:28 2013 +0530
+++ b/source/common/pixel.cpp Mon Nov 25 19:07:38 2013 +0530
@@ -968,8 +968,11 @@
p.ssim_4x4x2_core = ssim_4x4x2_core;
p.ssim_end_4 = ssim_end_4;
- p.var[LUMA_16x16] = pixel_var<16, 16>;
+ p.var[LUMA_8x4] = pixel_var<8, 4>;
p.var[LUMA_8x8] = pixel_var<8, 8>;
+ p.var[LUMA_8x16] = pixel_var<8, 16>;
+ p.var[LUMA_8x32] = pixel_var<8, 32>;
+
p.plane_copy_deinterleave_c = plane_copy_deinterleave_chroma;
}
}
diff -r 43da6ca15a61 -r deb2fc2dcaf2 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Nov 25 18:46:28 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Nov 25 19:07:38 2013 +0530
@@ -412,6 +412,15 @@
SETUP_LUMA_BLOCKCOPY_FUNC_DEF(64, 16, cpu); \
SETUP_LUMA_BLOCKCOPY_FUNC_DEF(16, 64, cpu);
+#define SETUP_PIXEL_VAR_DEF(W, H, cpu) /* point p.var[LUMA_WxH] at the assembly routine x265_pixel_var_WxH<cpu> */ \
+ p.var[LUMA_ ## W ## x ## H] = x265_pixel_var_ ## W ## x ## H ## cpu;
+
+#define LUMA_VAR(cpu) /* install the four 8xN pixel_var routines (8x4, 8x8, 8x16, 8x32) for one cpu suffix */ \
+ SETUP_PIXEL_VAR_DEF(8, 4, cpu); \
+ SETUP_PIXEL_VAR_DEF(8, 8, cpu); \
+ SETUP_PIXEL_VAR_DEF(8, 16, cpu); \
+ SETUP_PIXEL_VAR_DEF(8, 32, cpu);
+
namespace x265 {
// private x265 namespace
@@ -442,6 +451,8 @@
PIXEL_AVG(sse2);
PIXEL_AVG_W4(mmx2);
+ LUMA_VAR(_sse2);
+
p.sad[LUMA_8x32] = x265_pixel_sad_8x32_sse2;
p.sad[LUMA_16x4] = x265_pixel_sad_16x4_sse2;
p.sad[LUMA_16x12] = x265_pixel_sad_16x12_sse2;
diff -r 43da6ca15a61 -r deb2fc2dcaf2 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Mon Nov 25 18:46:28 2013 +0530
+++ b/source/common/x86/pixel-a.asm Mon Nov 25 19:07:38 2013 +0530
@@ -1301,6 +1301,106 @@
%if HIGH_BIT_DEPTH == 0
%macro VAR 0
+cglobal pixel_var_8x4, 2,3,8 ; 2 args (pix, pixstride per the pixel.h prototype) in r0/r1, 3 GPRs, 8 vector regs
+ VAR_START 1 ; macro defined elsewhere; presumably clears the accumulators and loads the byte mask - confirm
+ lea r2, [r1 * 3] ; r2 = 3 * stride (offset of row 3)
+ movh m0, [r0] ; row 0 -> low half of m0
+ movh m3, [r0 + r1] ; row 1 -> low half of m3
+ movhps m0, [r0 + r1 * 2] ; row 2 -> high half of m0
+ movhps m3, [r0 + r2] ; row 3 -> high half of m3
+ DEINTB 1, 0, 4, 3, 7 ; macro defined elsewhere; presumably widens the bytes of m0/m3 into m1/m0 and m4/m3 using m7 - confirm
+ ; rows 0-3 are the whole block, so r0 is not advanced before the final VAR_CORE
+ VAR_CORE
+ VAR_END 8, 4
+
+cglobal pixel_var_8x8, 2,3,8 ; 2 args (pix, pixstride per the pixel.h prototype) in r0/r1, 3 GPRs, 8 vector regs
+ VAR_START 1 ; macro defined elsewhere; presumably clears the accumulators and loads the byte mask - confirm
+ lea r2, [r1 * 3] ; r2 = 3 * stride (offset of the 4th row of each group)
+ movh m0, [r0] ; rows 0-3: row 0 -> low half of m0
+ movh m3, [r0 + r1] ; row 1 -> low half of m3
+ movhps m0, [r0 + r1 * 2] ; row 2 -> high half of m0
+ movhps m3, [r0 + r2] ; row 3 -> high half of m3
+ DEINTB 1, 0, 4, 3, 7 ; macro defined elsewhere; presumably widens the bytes of m0/m3 into m1/m0 and m4/m3 using m7 - confirm
+ lea r0, [r0 + r1 * 4] ; advance the source pointer by 4 rows
+ VAR_CORE
+ movh m0, [r0] ; rows 4-7, same load pattern as above
+ movh m3, [r0 + r1]
+ movhps m0, [r0 + r1 * 2]
+ movhps m3, [r0 + r2]
+ DEINTB 1, 0, 4, 3, 7
+ VAR_CORE ; last group: r0 is not advanced because nothing is loaded afterwards
+ VAR_END 8, 8
+
+
+cglobal pixel_var_8x16, 2,3,8 ; 2 args (pix, pixstride per the pixel.h prototype) in r0/r1; only r0-r2 are used, so 3 GPRs
+ VAR_START 1 ; macro defined elsewhere; presumably clears the accumulators and loads the byte mask - confirm
+ lea r2, [r1 * 3] ; r2 = 3 * stride (offset of the 4th row of each group)
+ movh m0, [r0] ; rows 0-3: row 0 -> low half of m0
+ movh m3, [r0 + r1] ; row 1 -> low half of m3
+ movhps m0, [r0 + r1 * 2] ; row 2 -> high half of m0
+ movhps m3, [r0 + r2] ; row 3 -> high half of m3
+ DEINTB 1, 0, 4, 3, 7 ; macro defined elsewhere; presumably widens the bytes of m0/m3 into m1/m0 and m4/m3 using m7 - confirm
+ lea r0, [r0 + r1 * 4] ; advance the source pointer by 4 rows
+ VAR_CORE
+ movh m0, [r0] ; rows 4-7, same load pattern as above
+ movh m3, [r0 + r1]
+ movhps m0, [r0 + r1 * 2]
+ movhps m3, [r0 + r2]
+ DEINTB 1, 0, 4, 3, 7
+ lea r0, [r0 + r1 * 4]
+ VAR_CORE
+ movh m0, [r0] ; rows 8-11
+ movh m3, [r0 + r1]
+ movhps m0, [r0 + r1 * 2]
+ movhps m3, [r0 + r2]
+ DEINTB 1, 0, 4, 3, 7
+ lea r0, [r0 + r1 * 4]
+ VAR_CORE
+ movh m0, [r0] ; rows 12-15
+ movh m3, [r0 + r1]
+ movhps m0, [r0 + r1 * 2]
+ movhps m3, [r0 + r2]
+ DEINTB 1, 0, 4, 3, 7
+ VAR_CORE ; last group: r0 is not advanced because nothing is loaded afterwards
+ VAR_END 8, 16
+
+cglobal pixel_var_8x32, 2,4,8 ; 2 args (pix, pixstride per the pixel.h prototype) in r0/r1; r2 = loop counter, r3 = 3 * stride
+ VAR_START 1 ; macro defined elsewhere; presumably clears the accumulators and loads the byte mask - confirm
+ mov r2d, 2 ; 2 iterations x 16 rows per iteration = 32 rows
+ lea r3, [r1 * 3] ; r3 = 3 * stride (offset of the 4th row of each group)
+.loop:
+ movh m0, [r0] ; group 1 of 4: row 0 -> low half of m0
+ movh m3, [r0 + r1] ; row 1 -> low half of m3
+ movhps m0, [r0 + r1 * 2] ; row 2 -> high half of m0
+ movhps m3, [r0 + r3] ; row 3 -> high half of m3
+ DEINTB 1, 0, 4, 3, 7 ; macro defined elsewhere; presumably widens the bytes of m0/m3 into m1/m0 and m4/m3 using m7 - confirm
+ lea r0, [r0 + r1 * 4] ; advance the source pointer by 4 rows
+ VAR_CORE
+ movh m0, [r0] ; group 2 of 4, same load pattern as above
+ movh m3, [r0 + r1]
+ movhps m0, [r0 + r1 * 2]
+ movhps m3, [r0 + r3]
+ DEINTB 1, 0, 4, 3, 7
+ lea r0, [r0 + r1 * 4]
+ VAR_CORE
+ movh m0, [r0] ; group 3 of 4
+ movh m3, [r0 + r1]
+ movhps m0, [r0 + r1 * 2]
+ movhps m3, [r0 + r3]
+ DEINTB 1, 0, 4, 3, 7
+ lea r0, [r0 + r1 * 4]
+ VAR_CORE
+ movh m0, [r0] ; group 4 of 4
+ movh m3, [r0 + r1]
+ movhps m0, [r0 + r1 * 2]
+ movhps m3, [r0 + r3]
+ DEINTB 1, 0, 4, 3, 7
+ lea r0, [r0 + r1 * 4] ; this advance is needed: the next loop iteration continues from here
+ VAR_CORE
+ dec r2d ; one 16-row iteration done
+ jnz .loop ; repeat until the counter reaches zero
+ VAR_END 8, 32
+
cglobal pixel_var_16x16, 2,3,8
VAR_START 1
mov r2d, 8
@@ -1313,38 +1413,6 @@
dec r2d
jg .loop
VAR_END 16, 16
-
-cglobal pixel_var_8x8, 2,4,8
- VAR_START 1
- mov r2d, 2
- lea r3, [r1*3]
-.loop:
- movh m0, [r0]
- movh m3, [r0+r1]
- movhps m0, [r0+r1*2]
- movhps m3, [r0+r3]
- DEINTB 1, 0, 4, 3, 7
- lea r0, [r0+r1*4]
- VAR_CORE
- dec r2d
- jg .loop
- VAR_END 8, 8
-
-cglobal pixel_var_8x16, 2,4,8
- VAR_START 1
- mov r2d, 4
- lea r3, [r1*3]
-.loop:
- movh m0, [r0]
- movh m3, [r0+r1]
- movhps m0, [r0+r1*2]
- movhps m3, [r0+r3]
- DEINTB 1, 0, 4, 3, 7
- lea r0, [r0+r1*4]
- VAR_CORE
- dec r2d
- jg .loop
- VAR_END 8, 16
%endmacro ; VAR
INIT_XMM sse2
diff -r 43da6ca15a61 -r deb2fc2dcaf2 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Mon Nov 25 18:46:28 2013 +0530
+++ b/source/common/x86/pixel.h Mon Nov 25 19:07:38 2013 +0530
@@ -347,6 +347,17 @@
CHROMA_PIXELSUB_DEF(_sse4);
LUMA_PIXELSUB_DEF(_sse4);
+#define SETUP_LUMA_PIXELVAR_FUNC(W, H, cpu) /* declare the prototype of the assembly routine x265_pixel_var_WxH<cpu> */ \
+ uint64_t x265_pixel_var_ ## W ## x ## H ## cpu(pixel *pix, intptr_t pixstride);
+
+#define LUMA_PIXELVAR_DEF(cpu) /* declare the four 8xN pixel_var prototypes (8x4, 8x8, 8x16, 8x32) for one cpu suffix */ \
+ SETUP_LUMA_PIXELVAR_FUNC(8, 4, cpu); \
+ SETUP_LUMA_PIXELVAR_FUNC(8, 8, cpu); \
+ SETUP_LUMA_PIXELVAR_FUNC(8, 16, cpu); \
+ SETUP_LUMA_PIXELVAR_FUNC(8, 32, cpu);
+
+LUMA_PIXELVAR_DEF(_sse2); // emits the _sse2 prototypes; both helper macros are #undef'd later in this header
+
#undef DECL_PIXELS
#undef DECL_SUF
#undef DECL_HEVC_SSD
@@ -357,6 +368,8 @@
#undef SETUP_LUMA_PIXELSUB_PS_FUNC
#undef CHROMA_PIXELSUB_DEF
#undef LUMA_PIXELSUB_DEF
+#undef LUMA_PIXELVAR_DEF
+#undef SETUP_LUMA_PIXELVAR_FUNC
void x265_calcRecons4_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
void x265_calcRecons8_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
More information about the x265-devel
mailing list