[x265-commits] [x265] asm: pixelsub_ps routine for all block sizes
Murugan Vairavel
murugan at multicorewareinc.com
Wed Nov 13 06:48:22 CET 2013
details: http://hg.videolan.org/x265/rev/2d6dd46dc286
branches:
changeset: 5068:2d6dd46dc286
user: Murugan Vairavel <murugan at multicorewareinc.com>
date: Wed Nov 13 11:06:26 2013 +0530
description:
asm: pixelsub_ps routine for all block sizes
Subject: [x265] asm: Unit test code for pixelsub_ps function
details: http://hg.videolan.org/x265/rev/c4ca80d19105
branches:
changeset: 5069:c4ca80d19105
user: Murugan Vairavel <murugan at multicorewareinc.com>
date: Tue Nov 12 19:10:23 2013 +0530
description:
asm: Unit test code for pixelsub_ps function
diffstat:
source/common/pixel.cpp | 22 +-
source/common/primitives.h | 5 +
source/common/x86/asm-primitives.cpp | 6 +-
source/common/x86/pixel-a.asm | 1204 ++++++++++++++++++++++++++++++++++
source/common/x86/pixel.h | 66 +
source/test/pixelharness.cpp | 53 +
source/test/pixelharness.h | 2 +
7 files changed, 1354 insertions(+), 4 deletions(-)
diffs (truncated from 1469 to 300 lines):
diff -r 90c2763ee027 -r c4ca80d19105 source/common/pixel.cpp
--- a/source/common/pixel.cpp Tue Nov 12 16:55:09 2013 +0530
+++ b/source/common/pixel.cpp Tue Nov 12 19:10:23 2013 +0530
@@ -778,6 +778,22 @@ void blockcopy_ps_c(int16_t *a, intptr_t
b += strideb;
}
}
+
+template<int bx, int by>
+void pixel_sub_ps_c(int16_t *a, intptr_t dstride, pixel *b0, pixel *b1, intptr_t sstride0, intptr_t sstride1)
+{
+ for (int y = 0; y < by; y++)
+ {
+ for (int x = 0; x < bx; x++)
+ {
+ a[x] = (int16_t)(b0[x] - b1[x]);
+ }
+
+ b0 += sstride0;
+ b1 += sstride1;
+ a += dstride;
+ }
+}
} // end anonymous namespace
namespace x265 {
@@ -821,12 +837,14 @@ void Setup_C_PixelPrimitives(EncoderPrim
#define CHROMA(W, H) \
p.chroma_copy_pp[CHROMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
p.chroma_copy_sp[CHROMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
- p.chroma_copy_ps[CHROMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>;
+ p.chroma_copy_ps[CHROMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>;\
+ p.chroma_sub_ps[CHROMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>;
#define LUMA(W, H) \
p.luma_copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
p.luma_copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
- p.luma_copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>;
+ p.luma_copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>;\
+ p.luma_sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>;
LUMA(4, 4);
LUMA(8, 8);
diff -r 90c2763ee027 -r c4ca80d19105 source/common/primitives.h
--- a/source/common/primitives.h Tue Nov 12 16:55:09 2013 +0530
+++ b/source/common/primitives.h Tue Nov 12 19:10:23 2013 +0530
@@ -206,6 +206,8 @@ typedef void (*copy_pp_t)(pixel *dst, in
typedef void (*copy_sp_t)(pixel *dst, intptr_t dstStride, int16_t *src, intptr_t srcStride);
typedef void (*copy_ps_t)(int16_t *dst, intptr_t dstStride, pixel *src, intptr_t srcStride);
+typedef void (*pixel_sub_ps_t)(int16_t *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1);
+
/* Define a structure containing function pointers to optimized encoder
* primitives. Each pointer can reference either an assembly routine,
* a vectorized primitive, or a C function. */
@@ -236,6 +238,9 @@ struct EncoderPrimitives
copy_ps_t luma_copy_ps[NUM_LUMA_PARTITIONS];
copy_ps_t chroma_copy_ps[NUM_CHROMA_PARTITIONS];
+ pixel_sub_ps_t luma_sub_ps[NUM_LUMA_PARTITIONS];
+ pixel_sub_ps_t chroma_sub_ps[NUM_CHROMA_PARTITIONS];
+
ipfilter_ps_t ipfilter_ps[NUM_IPFILTER_P_S];
ipfilter_sp_t ipfilter_sp[NUM_IPFILTER_S_P];
ipfilter_ss_t ipfilter_ss[NUM_IPFILTER_S_S];
diff -r 90c2763ee027 -r c4ca80d19105 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Nov 12 16:55:09 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Nov 12 19:10:23 2013 +0530
@@ -138,7 +138,8 @@ extern "C" {
#define SETUP_CHROMA_FUNC_DEF(W, H, cpu) \
p.chroma_hpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
p.chroma_vpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
- p.chroma_copy_ps[CHROMA_ ## W ## x ## H] = x265_blockcopy_ps_ ## W ## x ## H ## cpu;
+ p.chroma_copy_ps[CHROMA_ ## W ## x ## H] = x265_blockcopy_ps_ ## W ## x ## H ## cpu; \
+ p.chroma_sub_ps[CHROMA_ ## W ## x ## H] = x265_pixel_sub_ps_ ## W ## x ## H ## cpu;
#define SETUP_CHROMA_SP_FUNC_DEF(W, H, cpu) \
p.chroma_vsp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
@@ -229,7 +230,8 @@ extern "C" {
p.luma_hps[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu; \
p.luma_vpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu; \
p.luma_vps[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu; \
- p.luma_copy_ps[LUMA_ ## W ## x ## H] = x265_blockcopy_ps_ ## W ## x ## H ## cpu;
+ p.luma_copy_ps[LUMA_ ## W ## x ## H] = x265_blockcopy_ps_ ## W ## x ## H ## cpu; \
+ p.luma_sub_ps[LUMA_ ## W ## x ## H] = x265_pixel_sub_ps_ ## W ## x ## H ## cpu;
#define SETUP_LUMA_SP_FUNC_DEF(W, H, cpu) \
p.luma_vsp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_sp_ ## W ## x ## H ## cpu;
diff -r 90c2763ee027 -r c4ca80d19105 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Tue Nov 12 16:55:09 2013 +0530
+++ b/source/common/x86/pixel-a.asm Tue Nov 12 19:10:23 2013 +0530
@@ -5580,3 +5580,1207 @@ ads_mvs_ssse3:
jl .loop
movifnidn eax, r0d
RET
+
+;-----------------------------------------------------------------------------
+; void pixel_sub_ps_2x4(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+%if ARCH_X86_64
+ cglobal pixel_sub_ps_2x4, 6, 8, 0
+
+ %define tmp_r1 r1
+ DECLARE_REG_TMP 6, 7
+%else
+ cglobal pixel_sub_ps_2x4, 6, 7, 0, 0-4
+
+ %define tmp_r1 dword [rsp]
+ DECLARE_REG_TMP 6, 1
+%endif ; ARCH_X86_64
+
+ add r1, r1
+
+%if ARCH_X86_64 == 0
+ mov tmp_r1, r1
+
+%endif
+
+movzx t0d, byte [r2]
+movzx t1d, byte [r3]
+
+sub t0d, t1d
+
+mov [r0], t0w
+
+movzx t0d, byte [r2 + 1]
+movzx t1d, byte [r3 + 1]
+
+sub t0d, t1d
+
+mov [r0 + 2], t0w
+
+add r0, tmp_r1
+
+movzx t0d, byte [r2 + r4]
+movzx t1d, byte [r3 + r5]
+
+sub t0d, t1d
+
+mov [r0], t0w
+
+movzx t0d, byte [r2 + r4 + 1]
+movzx t1d, byte [r3 + r5 + 1]
+
+sub t0d, t1d
+
+mov [r0 + 2], t0w
+
+add r0, tmp_r1
+
+movzx t0d, byte [r2 + r4 * 2]
+movzx t1d, byte [r3 + r5 * 2]
+
+sub t0d, t1d
+
+mov [r0], t0w
+
+movzx t0d, byte [r2 + r4 * 2 + 1]
+movzx t1d, byte [r3 + r5 * 2 + 1]
+
+sub t0d, t1d
+
+mov [r0 + 2], t0w
+
+add r0, tmp_r1
+
+lea r2, [r2 + r4 * 2]
+lea r3, [r3 + r5 * 2]
+
+movzx t0d, byte [r2 + r4]
+movzx t1d, byte [r3 + r5]
+
+sub t0d, t1d
+
+mov [r0], t0w
+
+movzx t0d, byte [r2 + r4 + 1]
+movzx t1d, byte [r3 + r5 + 1]
+
+sub t0d, t1d
+
+mov [r0 + 2], t0w
+
+RET
+
+;-----------------------------------------------------------------------------
+; void pixel_sub_ps_2x8(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+%if ARCH_X86_64
+ cglobal pixel_sub_ps_2x8, 6, 8, 0
+
+ %define tmp_r1 r1
+ DECLARE_REG_TMP 6, 7
+%else
+ cglobal pixel_sub_ps_2x8, 6, 7, 0, 0-4
+
+ %define tmp_r1 dword [rsp]
+ DECLARE_REG_TMP 6, 1
+%endif ; ARCH_X86_64
+
+ add r1, r1
+
+%if ARCH_X86_64 == 0
+ mov tmp_r1, r1
+
+%endif
+
+ movzx t0d, byte [r2]
+ movzx t1d, byte [r3]
+
+ sub t0d, t1d
+
+ mov [r0], t0w
+ movzx t0d, byte [r2 + 1]
+ movzx t1d, byte [r3 + 1]
+
+ sub t0d, t1d
+
+ mov [r0 + 2], t0w
+
+ add r0, tmp_r1
+
+ movzx t0d, byte [r2 + r4]
+ movzx t1d, byte [r3 + r5]
+
+ sub t0d, t1d
+
+ mov [r0], t0w
+ movzx t0d, byte [r2 + r4 + 1]
+ movzx t1d, byte [r3 + r5 + 1]
+
+ sub t0d, t1d
+
+ mov [r0 + 2], t0w
+
+ add r0, tmp_r1
+
+ movzx t0d, byte [r2 + r4 * 2]
+ movzx t1d, byte [r3 + r5 * 2]
+
+ sub t0d, t1d
+
+ mov [r0], t0w
+ movzx t0d, byte [r2 + r4 * 2 + 1]
+ movzx t1d, byte [r3 + r5 * 2 + 1]
+
+ sub t0d, t1d
+
+ mov [r0 + 2], t0w
+
+ add r0, tmp_r1
+
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
+
+ movzx t0d, byte [r2 + r4]
+ movzx t1d, byte [r3 + r5]
+
+ sub t0d, t1d
+
+ mov [r0], t0w
+ movzx t0d, byte [r2 + r4 + 1]
+ movzx t1d, byte [r3 + r5 + 1]
+
+ sub t0d, t1d
+
+ mov [r0 + 2], t0w
+
+ add r0, tmp_r1
+
+ movzx t0d, byte [r2 + r4 * 2]
+ movzx t1d, byte [r3 + r5 * 2]
+
+ sub t0d, t1d
+
+ mov [r0], t0w
+ movzx t0d, byte [r2 + r4 * 2 + 1]
+ movzx t1d, byte [r3 + r5 * 2 + 1]
+
+ sub t0d, t1d
+
+ mov [r0 + 2], t0w
+
+ add r0, tmp_r1
+
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
+
+ movzx t0d, byte [r2 + r4]
+ movzx t1d, byte [r3 + r5]
+
+ sub t0d, t1d
+
+ mov [r0], t0w
+ movzx t0d, byte [r2 + r4 + 1]
+ movzx t1d, byte [r3 + r5 + 1]
+
+ sub t0d, t1d
More information about the x265-commits
mailing list