[x265-commits] [x265] asm: pixelsub_ps routine for all block sizes

Murugan Vairavel murugan at multicorewareinc.com
Wed Nov 13 06:48:22 CET 2013


details:   http://hg.videolan.org/x265/rev/2d6dd46dc286
branches:  
changeset: 5068:2d6dd46dc286
user:      Murugan Vairavel <murugan at multicorewareinc.com>
date:      Wed Nov 13 11:06:26 2013 +0530
description:
asm: pixelsub_ps routine for all block sizes
Subject: [x265] asm: Unit test code for pixelsub_ps function

details:   http://hg.videolan.org/x265/rev/c4ca80d19105
branches:  
changeset: 5069:c4ca80d19105
user:      Murugan Vairavel <murugan at multicorewareinc.com>
date:      Tue Nov 12 19:10:23 2013 +0530
description:
asm: Unit test code for pixelsub_ps function

diffstat:

 source/common/pixel.cpp              |    22 +-
 source/common/primitives.h           |     5 +
 source/common/x86/asm-primitives.cpp |     6 +-
 source/common/x86/pixel-a.asm        |  1204 ++++++++++++++++++++++++++++++++++
 source/common/x86/pixel.h            |    66 +
 source/test/pixelharness.cpp         |    53 +
 source/test/pixelharness.h           |     2 +
 7 files changed, 1354 insertions(+), 4 deletions(-)

diffs (truncated from 1469 to 300 lines):

diff -r 90c2763ee027 -r c4ca80d19105 source/common/pixel.cpp
--- a/source/common/pixel.cpp	Tue Nov 12 16:55:09 2013 +0530
+++ b/source/common/pixel.cpp	Tue Nov 12 19:10:23 2013 +0530
@@ -778,6 +778,22 @@ void blockcopy_ps_c(int16_t *a, intptr_t
         b += strideb;
     }
 }
+
+template<int bx, int by> // bx: samples written per row, by: number of rows
+void pixel_sub_ps_c(int16_t *a, intptr_t dstride, pixel *b0, pixel *b1, intptr_t sstride0, intptr_t sstride1) // writes a = b0 - b1 per sample
+{
+    for (int y = 0; y < by; y++) // one pass per row of the block
+    {
+        for (int x = 0; x < bx; x++)
+        {
+            a[x] = (int16_t)(b0[x] - b1[x]); // difference of the two source samples, stored as int16_t
+        }
+
+        b0 += sstride0; // each source advances by its own stride (pointer arithmetic, so in pixel elements)
+        b1 += sstride1;
+        a += dstride; // destination stride is in int16_t elements
+    }
+}
 }  // end anonymous namespace
 
 namespace x265 {
@@ -821,12 +837,14 @@ void Setup_C_PixelPrimitives(EncoderPrim
 #define CHROMA(W, H) \
     p.chroma_copy_pp[CHROMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
     p.chroma_copy_sp[CHROMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
-    p.chroma_copy_ps[CHROMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>;
+    p.chroma_copy_ps[CHROMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>;\
+    p.chroma_sub_ps[CHROMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>;
 
 #define LUMA(W, H) \
     p.luma_copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
     p.luma_copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
-    p.luma_copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>;
+    p.luma_copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>;\
+    p.luma_sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>;
 
     LUMA(4, 4);
     LUMA(8, 8);
diff -r 90c2763ee027 -r c4ca80d19105 source/common/primitives.h
--- a/source/common/primitives.h	Tue Nov 12 16:55:09 2013 +0530
+++ b/source/common/primitives.h	Tue Nov 12 19:10:23 2013 +0530
@@ -206,6 +206,8 @@ typedef void (*copy_pp_t)(pixel *dst, in
 typedef void (*copy_sp_t)(pixel *dst, intptr_t dstStride, int16_t *src, intptr_t srcStride);
 typedef void (*copy_ps_t)(int16_t *dst, intptr_t dstStride, pixel *src, intptr_t srcStride);
 
+typedef void (*pixel_sub_ps_t)(int16_t *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1);
+
 /* Define a structure containing function pointers to optimized encoder
  * primitives.  Each pointer can reference either an assembly routine,
  * a vectorized primitive, or a C function. */
@@ -236,6 +238,9 @@ struct EncoderPrimitives
     copy_ps_t       luma_copy_ps[NUM_LUMA_PARTITIONS];
     copy_ps_t       chroma_copy_ps[NUM_CHROMA_PARTITIONS];
 
+    pixel_sub_ps_t  luma_sub_ps[NUM_LUMA_PARTITIONS];
+    pixel_sub_ps_t  chroma_sub_ps[NUM_CHROMA_PARTITIONS];
+
     ipfilter_ps_t   ipfilter_ps[NUM_IPFILTER_P_S];
     ipfilter_sp_t   ipfilter_sp[NUM_IPFILTER_S_P];
     ipfilter_ss_t   ipfilter_ss[NUM_IPFILTER_S_S];
diff -r 90c2763ee027 -r c4ca80d19105 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Nov 12 16:55:09 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Nov 12 19:10:23 2013 +0530
@@ -138,7 +138,8 @@ extern "C" {
 #define SETUP_CHROMA_FUNC_DEF(W, H, cpu) \
     p.chroma_hpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
     p.chroma_vpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
-    p.chroma_copy_ps[CHROMA_ ## W ## x ## H] = x265_blockcopy_ps_ ## W ## x ## H ## cpu;
+    p.chroma_copy_ps[CHROMA_ ## W ## x ## H] = x265_blockcopy_ps_ ## W ## x ## H ## cpu; \
+    p.chroma_sub_ps[CHROMA_ ## W ## x ## H] = x265_pixel_sub_ps_ ## W ## x ## H ## cpu;
 
 #define SETUP_CHROMA_SP_FUNC_DEF(W, H, cpu) \
     p.chroma_vsp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
@@ -229,7 +230,8 @@ extern "C" {
     p.luma_hps[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu; \
     p.luma_vpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu; \
     p.luma_vps[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu; \
-    p.luma_copy_ps[LUMA_ ## W ## x ## H] = x265_blockcopy_ps_ ## W ## x ## H ## cpu;
+    p.luma_copy_ps[LUMA_ ## W ## x ## H] = x265_blockcopy_ps_ ## W ## x ## H ## cpu; \
+    p.luma_sub_ps[LUMA_ ## W ## x ## H] = x265_pixel_sub_ps_ ## W ## x ## H ## cpu;
 
 #define SETUP_LUMA_SP_FUNC_DEF(W, H, cpu) \
     p.luma_vsp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_sp_ ## W ## x ## H ## cpu;
diff -r 90c2763ee027 -r c4ca80d19105 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Tue Nov 12 16:55:09 2013 +0530
+++ b/source/common/x86/pixel-a.asm	Tue Nov 12 19:10:23 2013 +0530
@@ -5580,3 +5580,1207 @@ ads_mvs_ssse3:
     jl .loop
     movifnidn eax, r0d
     RET
+
+;-----------------------------------------------------------------------------
+; void pixel_sub_ps_2x4(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+%if ARCH_X86_64
+    cglobal pixel_sub_ps_2x4, 6, 8, 0 ; 6 arguments (r0-r5), 8 GPRs, no XMM registers
+
+    %define tmp_r1     r1
+    DECLARE_REG_TMP    6, 7 ; t0 = r6, t1 = r7, so r1 can keep holding the dest stride
+%else
+    cglobal pixel_sub_ps_2x4, 6, 7, 0, 0-4 ; 7 GPRs; NOTE(review): "0-4" presumably requests 4 bytes of stack for the stride spill - confirm against x86inc
+
+    %define tmp_r1     dword [rsp]
+    DECLARE_REG_TMP    6, 1 ; t0 = r6, t1 = r1: r1 is reused as a temporary, hence the stack copy of the stride
+%endif ; ARCH_X86_64
+
+    add    r1,         r1 ; double destride: assumes it is passed in int16_t elements (as in the C reference), giving a byte stride
+
+%if ARCH_X86_64 == 0
+    mov    tmp_r1,     r1 ; save the byte stride before t1 (= r1) is overwritten below
+
+%endif
+
+movzx    t0d,      byte [r2] ; row 0, column 0: src0 sample (byte load, i.e. 8-bit pixels)
+movzx    t1d,      byte [r3] ; row 0, column 0: src1 sample
+
+sub      t0d,      t1d ; t0 = src0 - src1
+
+mov      [r0],     t0w ; store the low 16 bits as dest[0]
+
+movzx    t0d,      byte [r2 + 1] ; row 0, column 1
+movzx    t1d,      byte [r3 + 1]
+
+sub      t0d,      t1d
+
+mov      [r0 + 2], t0w ; dest[1] (2 bytes per int16_t)
+
+add      r0,       tmp_r1 ; advance dest to row 1
+
+movzx    t0d,      byte [r2 + r4] ; row 1, column 0
+movzx    t1d,      byte [r3 + r5]
+
+sub      t0d,      t1d
+
+mov      [r0],     t0w
+
+movzx    t0d,      byte [r2 + r4 + 1] ; row 1, column 1
+movzx    t1d,      byte [r3 + r5 + 1]
+
+sub      t0d,      t1d
+
+mov      [r0 + 2], t0w
+
+add      r0,       tmp_r1 ; advance dest to row 2
+
+movzx    t0d,      byte [r2 + r4 * 2] ; row 2, column 0
+movzx    t1d,      byte [r3 + r5 * 2]
+
+sub      t0d,      t1d
+
+mov      [r0],     t0w
+
+movzx    t0d,      byte [r2 + r4 * 2 + 1] ; row 2, column 1
+movzx    t1d,      byte [r3 + r5 * 2 + 1]
+
+sub      t0d,      t1d
+
+mov      [r0 + 2], t0w
+
+add      r0,       tmp_r1 ; advance dest to row 3
+
+lea      r2,       [r2 + r4 * 2] ; src0 now points at row 2
+lea      r3,       [r3 + r5 * 2] ; src1 now points at row 2
+
+movzx    t0d,      byte [r2 + r4] ; row 3, column 0 (row 2 + one stride)
+movzx    t1d,      byte [r3 + r5]
+
+sub      t0d,      t1d
+
+mov      [r0],     t0w
+
+movzx    t0d,      byte [r2 + r4 + 1] ; row 3, column 1
+movzx    t1d,      byte [r3 + r5 + 1]
+
+sub      t0d,      t1d
+
+mov      [r0 + 2], t0w
+
+RET
+
+;-----------------------------------------------------------------------------
+; void pixel_sub_ps_c_2x8(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+%if ARCH_X86_64
+    cglobal pixel_sub_ps_2x8, 6, 8, 0
+
+    %define tmp_r1     r1
+    DECLARE_REG_TMP    6, 7
+%else
+    cglobal pixel_sub_ps_2x8, 6, 7, 0, 0-4
+
+    %define tmp_r1     dword [rsp]
+    DECLARE_REG_TMP    6, 1
+%endif ; ARCH_X86_64
+
+    add    r1,         r1
+
+%if ARCH_X86_64 == 0
+    mov    tmp_r1,     r1
+
+%endif
+
+    movzx    t0d,      byte [r2]
+    movzx    t1d,      byte [r3]
+
+    sub      t0d,      t1d
+
+    mov      [r0],     t0w
+    movzx    t0d,      byte [r2 + 1]
+    movzx    t1d,      byte [r3 + 1]
+
+    sub      t0d,      t1d
+
+    mov      [r0 + 2], t0w
+
+    add      r0,       tmp_r1
+
+    movzx    t0d,      byte [r2 + r4]
+    movzx    t1d,      byte [r3 + r5]
+
+    sub      t0d,      t1d
+
+    mov      [r0],     t0w
+    movzx    t0d,      byte [r2 + r4 + 1]
+    movzx    t1d,      byte [r3 + r5 + 1]
+
+    sub      t0d,      t1d
+
+    mov      [r0 + 2], t0w
+
+    add      r0,       tmp_r1
+
+    movzx    t0d,      byte [r2 + r4 * 2]
+    movzx    t1d,      byte [r3 + r5 * 2]
+
+    sub      t0d,      t1d
+
+    mov      [r0],     t0w
+    movzx    t0d,      byte [r2 + r4 * 2 + 1]
+    movzx    t1d,      byte [r3 + r5 * 2 + 1]
+
+    sub      t0d,      t1d
+
+    mov      [r0 + 2], t0w
+
+    add      r0,       tmp_r1
+
+    lea      r2,       [r2 + r4 * 2]
+    lea      r3,       [r3 + r5 * 2]
+
+    movzx    t0d,      byte [r2 + r4]
+    movzx    t1d,      byte [r3 + r5]
+
+    sub      t0d,      t1d
+
+    mov      [r0],     t0w
+    movzx    t0d,      byte [r2 + r4 + 1]
+    movzx    t1d,      byte [r3 + r5 + 1]
+
+    sub      t0d,      t1d
+
+    mov      [r0 + 2], t0w
+
+    add      r0,       tmp_r1
+
+    movzx    t0d,      byte [r2 + r4 * 2]
+    movzx    t1d,      byte [r3 + r5 * 2]
+
+    sub      t0d,      t1d
+
+    mov      [r0],     t0w
+    movzx    t0d,      byte [r2 + r4 * 2 + 1]
+    movzx    t1d,      byte [r3 + r5 * 2 + 1]
+
+    sub      t0d,      t1d
+
+    mov      [r0 + 2], t0w
+
+    add      r0,       tmp_r1
+
+    lea      r2,       [r2 + r4 * 2]
+    lea      r3,       [r3 + r5 * 2]
+
+    movzx    t0d,      byte [r2 + r4]
+    movzx    t1d,      byte [r3 + r5]
+
+    sub      t0d,      t1d
+
+    mov      [r0],     t0w
+    movzx    t0d,      byte [r2 + r4 + 1]
+    movzx    t1d,      byte [r3 + r5 + 1]
+
+    sub      t0d,      t1d


More information about the x265-commits mailing list