<div dir="ltr"><br><div class="gmail_extra"><br><br><div class="gmail_quote">On Tue, Nov 12, 2013 at 7:41 AM,  <span dir="ltr"><<a href="mailto:murugan@multicorewareinc.com" target="_blank">murugan@multicorewareinc.com</a>></span> wrote:<br>
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>
# User Murugan Vairavel <<a href="mailto:murugan@multicorewareinc.com">murugan@multicorewareinc.com</a>><br>
# Date 1384263623 -19800<br>
#      Tue Nov 12 19:10:23 2013 +0530<br>
# Node ID b1e0fe97bbfa7bf367d7318f057690c64f1f1f19<br>
# Parent  7a8118d07276312b2971b292d689805074abd28a<br>
asm: Unit test code for pixelsub_ps function<br></blockquote><div><br></div><div>You need to address Min's comments for the asm patch.</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">

<br>
diff -r 7a8118d07276 -r b1e0fe97bbfa source/common/pixel.cpp<br>
--- a/source/common/pixel.cpp   Tue Nov 12 17:06:34 2013 +0530<br>
+++ b/source/common/pixel.cpp   Tue Nov 12 19:10:23 2013 +0530<br>
@@ -778,6 +778,22 @@<br>
         b += strideb;<br>
     }<br>
 }<br>
+<br>
+template<int bx, int by><br>
+void pixel_sub_ps_c(int16_t *a, intptr_t dstride, pixel *b0, pixel *b1, intptr_t sstride0, intptr_t sstride1)<br>
+{<br>
+    for (int y = 0; y < by; y++)<br>
+    {<br>
+        for (int x = 0; x < bx; x++)<br>
+        {<br>
+            a[x] = (int16_t)(b0[x] - b1[x]);<br>
+        }<br>
+<br>
+        b0 += sstride0;<br>
+        b1 += sstride1;<br>
+        a += dstride;<br>
+    }<br>
+}<br>
 }  // end anonymous namespace<br>
<br>
 namespace x265 {<br>
@@ -821,12 +837,14 @@<br>
 #define CHROMA(W, H) \<br>
     p.chroma_copy_pp[CHROMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \<br>
     p.chroma_copy_sp[CHROMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \<br>
-    p.chroma_copy_ps[CHROMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>;<br>
+    p.chroma_copy_ps[CHROMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>;\<br>
+    p.chroma_sub_ps[CHROMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>;<br>
<br>
 #define LUMA(W, H) \<br>
     p.luma_copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \<br>
     p.luma_copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \<br>
-    p.luma_copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>;<br>
+    p.luma_copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>;\<br>
+    p.luma_sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>;<br>
<br>
     LUMA(4, 4);<br>
     LUMA(8, 8);<br>
diff -r 7a8118d07276 -r b1e0fe97bbfa source/common/primitives.h<br>
--- a/source/common/primitives.h        Tue Nov 12 17:06:34 2013 +0530<br>
+++ b/source/common/primitives.h        Tue Nov 12 19:10:23 2013 +0530<br>
@@ -207,6 +207,8 @@<br>
 typedef void (*copy_sp_t)(pixel *dst, intptr_t dstStride, int16_t *src, intptr_t srcStride);<br>
 typedef void (*copy_ps_t)(int16_t *dst, intptr_t dstStride, pixel *src, intptr_t srcStride);<br>
<br>
+typedef void (*pixel_sub_ps_t)(int16_t *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1);<br>
+<br>
 /* Define a structure containing function pointers to optimized encoder<br>
  * primitives.  Each pointer can reference either an assembly routine,<br>
  * a vectorized primitive, or a C function. */<br>
@@ -237,6 +239,9 @@<br>
     copy_ps_t       luma_copy_ps[NUM_LUMA_PARTITIONS];<br>
     copy_ps_t       chroma_copy_ps[NUM_CHROMA_PARTITIONS];<br>
<br>
+    pixel_sub_ps_t  luma_sub_ps[NUM_LUMA_PARTITIONS];<br>
+    pixel_sub_ps_t  chroma_sub_ps[NUM_CHROMA_PARTITIONS];<br>
+<br>
     ipfilter_ps_t   ipfilter_ps[NUM_IPFILTER_P_S];<br>
     ipfilter_sp_t   ipfilter_sp[NUM_IPFILTER_S_P];<br>
     ipfilter_ss_t   ipfilter_ss[NUM_IPFILTER_S_S];<br>
diff -r 7a8118d07276 -r b1e0fe97bbfa source/common/x86/asm-primitives.cpp<br>
--- a/source/common/x86/asm-primitives.cpp      Tue Nov 12 17:06:34 2013 +0530<br>
+++ b/source/common/x86/asm-primitives.cpp      Tue Nov 12 19:10:23 2013 +0530<br>
@@ -133,7 +133,8 @@<br>
<br>
 #define SETUP_CHROMA_FUNC_DEF(W, H, cpu) \<br>
     p.chroma_hpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \<br>
-    p.chroma_vpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu;<br>
+    p.chroma_vpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu;\<br>
+    p.chroma_sub_ps[CHROMA_ ## W ## x ## H] = x265_pixel_sub_ps_ ## W ## x ## H ## cpu;<br>
<br>
 #define SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(W, H, cpu) \<br>
     p.chroma_copy_pp[CHROMA_ ## W ## x ## H] = x265_blockcopy_pp_ ## W ## x ## H ## cpu;<br>
@@ -194,7 +195,8 @@<br>
     p.luma_hpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu; \<br>
     p.luma_hps[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu; \<br>
     p.luma_vpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu; \<br>
-    p.luma_vps[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu;<br>
+    p.luma_vps[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu;\<br>
+    p.luma_sub_ps[LUMA_ ## W ## x ## H] = x265_pixel_sub_ps_ ## W ## x ## H ## cpu;<br>
<br>
 #define SETUP_LUMA_BLOCKCOPY_FUNC_DEF(W, H, cpu) \<br>
     p.luma_copy_pp[LUMA_ ## W ## x ## H] = x265_blockcopy_pp_ ## W ## x ## H ## cpu;<br>
diff -r 7a8118d07276 -r b1e0fe97bbfa source/common/x86/pixel.h<br>
--- a/source/common/x86/pixel.h Tue Nov 12 17:06:34 2013 +0530<br>
+++ b/source/common/x86/pixel.h Tue Nov 12 19:10:23 2013 +0530<br>
@@ -266,11 +266,77 @@<br>
 DECL_ADS(2, avx2)<br>
 DECL_ADS(1, avx2)<br>
<br>
+#define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \<br>
+    void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);<br>
+<br>
+#define CHROMA_PIXELSUB_DEF(cpu) \<br>
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 4, cpu); \<br>
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 2, cpu); \<br>
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(2, 4, cpu); \<br>
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 8, cpu); \<br>
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 4, cpu); \<br>
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 8, cpu); \<br>
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 6, cpu); \<br>
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(6, 8, cpu); \<br>
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 2, cpu); \<br>
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(2, 8, cpu); \<br>
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 16, cpu); \<br>
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 8, cpu); \<br>
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 16, cpu); \<br>
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 12, cpu); \<br>
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(12, 16, cpu); \<br>
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 4, cpu); \<br>
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(4, 16, cpu); \<br>
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 32, cpu); \<br>
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 16, cpu); \<br>
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(16, 32, cpu); \<br>
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 24, cpu); \<br>
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(24, 32, cpu); \<br>
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(32, 8, cpu); \<br>
+    SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 32, cpu);<br>
+<br>
+#define SETUP_LUMA_PIXELSUB_PS_FUNC(W, H, cpu) \<br>
+    void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);<br>
+<br>
+#define LUMA_PIXELSUB_DEF(cpu) \<br>
+    SETUP_LUMA_PIXELSUB_PS_FUNC(4,   4, cpu); \<br>
+    SETUP_LUMA_PIXELSUB_PS_FUNC(8,   8, cpu); \<br>
+    SETUP_LUMA_PIXELSUB_PS_FUNC(8,   4, cpu); \<br>
+    SETUP_LUMA_PIXELSUB_PS_FUNC(4,   8, cpu); \<br>
+    SETUP_LUMA_PIXELSUB_PS_FUNC(16, 16, cpu); \<br>
+    SETUP_LUMA_PIXELSUB_PS_FUNC(16,  8, cpu); \<br>
+    SETUP_LUMA_PIXELSUB_PS_FUNC(8,  16, cpu); \<br>
+    SETUP_LUMA_PIXELSUB_PS_FUNC(16, 12, cpu); \<br>
+    SETUP_LUMA_PIXELSUB_PS_FUNC(12, 16, cpu); \<br>
+    SETUP_LUMA_PIXELSUB_PS_FUNC(16,  4, cpu); \<br>
+    SETUP_LUMA_PIXELSUB_PS_FUNC(4,  16, cpu); \<br>
+    SETUP_LUMA_PIXELSUB_PS_FUNC(32, 32, cpu); \<br>
+    SETUP_LUMA_PIXELSUB_PS_FUNC(32, 16, cpu); \<br>
+    SETUP_LUMA_PIXELSUB_PS_FUNC(16, 32, cpu); \<br>
+    SETUP_LUMA_PIXELSUB_PS_FUNC(32, 24, cpu); \<br>
+    SETUP_LUMA_PIXELSUB_PS_FUNC(24, 32, cpu); \<br>
+    SETUP_LUMA_PIXELSUB_PS_FUNC(32,  8, cpu); \<br>
+    SETUP_LUMA_PIXELSUB_PS_FUNC(8,  32, cpu); \<br>
+    SETUP_LUMA_PIXELSUB_PS_FUNC(64, 64, cpu); \<br>
+    SETUP_LUMA_PIXELSUB_PS_FUNC(64, 32, cpu); \<br>
+    SETUP_LUMA_PIXELSUB_PS_FUNC(32, 64, cpu); \<br>
+    SETUP_LUMA_PIXELSUB_PS_FUNC(64, 48, cpu); \<br>
+    SETUP_LUMA_PIXELSUB_PS_FUNC(48, 64, cpu); \<br>
+    SETUP_LUMA_PIXELSUB_PS_FUNC(64, 16, cpu); \<br>
+    SETUP_LUMA_PIXELSUB_PS_FUNC(16, 64, cpu);<br>
+<br>
+CHROMA_PIXELSUB_DEF(_sse4);<br>
+LUMA_PIXELSUB_DEF(_sse4);<br>
+<br>
 #undef DECL_PIXELS<br>
 #undef DECL_SUF<br>
 #undef DECL_HEVC_SSD<br>
 #undef DECL_X1<br>
 #undef DECL_X4<br>
 #undef DECL_ADS<br>
+#undef SETUP_CHROMA_PIXELSUB_PS_FUNC<br>
+#undef SETUP_LUMA_PIXELSUB_PS_FUNC<br>
+#undef CHROMA_PIXELSUB_DEF<br>
+#undef LUMA_PIXELSUB_DEF<br>
<br>
 #endif // ifndef X265_I386_PIXEL_H<br>
diff -r 7a8118d07276 -r b1e0fe97bbfa source/test/pixelharness.cpp<br>
--- a/source/test/pixelharness.cpp      Tue Nov 12 17:06:34 2013 +0530<br>
+++ b/source/test/pixelharness.cpp      Tue Nov 12 19:10:23 2013 +0530<br>
@@ -586,6 +586,29 @@<br>
     return true;<br>
 }<br>
<br>
+bool PixelHarness::check_pixel_sub_ps(pixel_sub_ps_t ref, pixel_sub_ps_t opt)<br>
+{<br>
+    ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);<br>
+    ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);<br>
+<br>
+    memset(ref_dest, 0xCD, sizeof(ref_dest));<br>
+    memset(opt_dest, 0xCD, sizeof(opt_dest));<br>
+<br>
+    int j = 0;<br>
+    for (int i = 0; i < 1; i++)<br>
+    {<br>
+        opt(opt_dest, 64, pbuf2 + j, pbuf1 + j, STRIDE, STRIDE);<br>
+        ref(ref_dest, 64, pbuf2 + j, pbuf1 + j, STRIDE, STRIDE);<br>
+<br>
+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))<br>
+            return false;<br>
+<br>
+        j += INCR;<br>
+    }<br>
+<br>
+    return true;<br>
+}<br>
+<br>
 bool PixelHarness::testPartition(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt)<br>
 {<br>
     if (opt.satd[part])<br>
@@ -722,6 +745,24 @@<br>
             return false;<br>
         }<br>
     }<br>
+<br>
+    if (opt.luma_sub_ps[part])<br>
+    {<br>
+        if (!check_pixel_sub_ps(ref.luma_sub_ps[part], opt.luma_sub_ps[part]))<br>
+        {<br>
+            printf("luma_sub_ps[%s] failed\n", lumaPartStr[part]);<br>
+            return false;<br>
+        }<br>
+    }<br>
+<br>
+    if (opt.chroma_sub_ps[part])<br>
+    {<br>
+        if (!check_pixel_sub_ps(ref.chroma_sub_ps[part], opt.chroma_sub_ps[part]))<br>
+        {<br>
+            printf("chroma_sub_ps[%s] failed\n", chromaPartStr[part]);<br>
+            return false;<br>
+        }<br>
+    }<br>
     return true;<br>
 }<br>
<br>
@@ -968,6 +1009,18 @@<br>
         printf("ccpy_ps[%s]", chromaPartStr[part]);<br>
         REPORT_SPEEDUP(opt.chroma_copy_ps[part], ref.chroma_copy_ps[part], sbuf1, 64, pbuf1, 128);<br>
     }<br>
+<br>
+    if (opt.luma_sub_ps[part])<br>
+    {<br>
+        printf("luma_sub_ps[%s]", lumaPartStr[part]);<br>
+        REPORT_SPEEDUP(opt.luma_sub_ps[part], ref.luma_sub_ps[part], (int16_t*)pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);<br>
+    }<br>
+<br>
+    if (opt.chroma_sub_ps[part])<br>
+    {<br>
+        printf("chroma_sub_ps[%s]", chromaPartStr[part]);<br>
+        REPORT_SPEEDUP(opt.chroma_sub_ps[part], ref.chroma_sub_ps[part], (int16_t*)pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);<br>
+    }<br>
 }<br>
<br>
 void PixelHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimitives& opt)<br>
diff -r 7a8118d07276 -r b1e0fe97bbfa source/test/pixelharness.h<br>
--- a/source/test/pixelharness.h        Tue Nov 12 17:06:34 2013 +0530<br>
+++ b/source/test/pixelharness.h        Tue Nov 12 19:10:23 2013 +0530<br>
@@ -60,6 +60,8 @@<br>
     bool check_block_copy_ps(copy_ps_t ref, copy_ps_t opt);<br>
<br>
     bool check_blockfill_s(blockfill_s_t ref, blockfill_s_t opt);<br>
+<br>
+    bool check_pixel_sub_ps(pixel_sub_ps_t ref, pixel_sub_ps_t opt);<br>
 public:<br>
<br>
     PixelHarness();<br>
_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
</blockquote></div><br><br clear="all"><div><br></div>-- <br>Steve Borho
</div></div>