<div dir="ltr">Merged, sent implementation.<div><br></div><div>Regards,</div><div>Praveen Tiwari<br><div><br></div><div><br></div></div></div><div class="gmail_extra"><br><br><div class="gmail_quote">On Wed, Nov 20, 2013 at 6:08 PM, chen <span dir="ltr"><<a href="mailto:chenm003@163.com" target="_blank">chenm003@163.com</a>></span> wrote:<br>

<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div style="line-height:1.7;font-size:14px;font-family:arial"><pre>At 2013-11-20 19:45:24,<a href="mailto:praveen@multicorewareinc.com" target="_blank">praveen@multicorewareinc.com</a> wrote:

># HG changeset patch

># User Praveen Tiwari

># Date 1384947915 -19800

># Node ID c1e556f54d61422d153ff67f4830dc62dd1111d9

># Parent  a7fb47a7eddf18634449a5ac898f7c2d029048e9

>asm code for pixeladd_ps_4x4 and testbench integration

>

>diff -r a7fb47a7eddf -r c1e556f54d61 source/common/CMakeLists.txt

>--- a/source/common/CMakeLists.txt  Wed Nov 20 12:57:57 2013 +0530

>+++ b/source/common/CMakeLists.txt  Wed Nov 20 17:15:15 2013 +0530

>@@ -113,7 +113,7 @@

> 

> if(ENABLE_PRIMITIVES_ASM)

>     set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h)

>-    set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm sad-a.asm mc-a.asm mc-a2.asm ipfilter8.asm pixel-util.asm blockcopy8.asm)

>+    set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm sad-a.asm mc-a.asm mc-a2.asm ipfilter8.asm pixel-util.asm blockcopy8.asm pixeladd8.asm)

>     if (NOT X64)

>         set(A_SRCS ${A_SRCS} pixel-32.asm)

>     endif()

>diff -r a7fb47a7eddf -r c1e556f54d61 source/common/x86/asm-primitives.cpp

>--- a/source/common/x86/asm-primitives.cpp  Wed Nov 20 12:57:57 2013 +0530

>+++ b/source/common/x86/asm-primitives.cpp  Wed Nov 20 17:15:15 2013 +0530

>@@ -633,6 +633,13 @@

>         p.calcrecon[BLOCK_32x32] = x265_calcRecons32_sse4;

>         p.calcresidual[BLOCK_16x16] = x265_getResidual16_sse4;

>         p.calcresidual[BLOCK_32x32] = x265_getResidual32_sse4;

>+

>+        // This function pointer initialization is temporary will be removed

>+        // later with macro definitions.  It is used to avoid linker errors

>+        // until all partitions are coded and commit smaller patches, easier to

>+        // review.

>+

>+        p.chroma_add_ps[X265_CSP_I420][CHROMA_4x4] = x265_pixel_add_ps_4x4_sse4;

>     }

>     if (cpuMask & X265_CPU_AVX)

>     {

>diff -r a7fb47a7eddf -r c1e556f54d61 source/common/x86/pixel.h

>--- a/source/common/x86/pixel.h     Wed Nov 20 12:57:57 2013 +0530

>+++ b/source/common/x86/pixel.h     Wed Nov 20 17:15:15 2013 +0530

>@@ -313,7 +313,8 @@

>     SETUP_CHROMA_PIXELSUB_PS_FUNC(8, 32, cpu);

> 

> #define SETUP_LUMA_PIXELSUB_PS_FUNC(W, H, cpu) \

>-    void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);

>+    void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);\

>+    void x265_pixel_add_ps_ ## W ## x ## H ## cpu(pixel *dest, int destride, pixel *src0, int16_t *scr1, int srcStride0, int srcStride1);

> 

> #define LUMA_PIXELSUB_DEF(cpu) \

>     SETUP_LUMA_PIXELSUB_PS_FUNC(4,   4, cpu); \

>@@ -342,6 +343,8 @@

>     SETUP_LUMA_PIXELSUB_PS_FUNC(64, 16, cpu); \

>     SETUP_LUMA_PIXELSUB_PS_FUNC(16, 64, cpu);

> 

>+//    void x265_pixeladd_ps_4x4_sse4(pixel *dest, int destride, pixel *src0, int16_t *scr1, int srcStride0, int srcStride1);

>+

Please remove this unused, commented-out line.</pre><pre> </pre><pre><div><div class="h5">> CHROMA_PIXELSUB_DEF(_sse4);

> LUMA_PIXELSUB_DEF(_sse4);

> 

>diff -r a7fb47a7eddf -r c1e556f54d61 source/common/x86/pixeladd8.asm

>--- /dev/null       Thu Jan 01 00:00:00 1970 +0000

>+++ b/source/common/x86/pixeladd8.asm       Wed Nov 20 17:15:15 2013 +0530

>@@ -0,0 +1,79 @@

>+;*****************************************************************************

>+;* Copyright (C) 2013 x265 project

>+;*

>+;* Authors: Praveen Kumar Tiwari <<a href="mailto:praveen@multicorewareinc.com" target="_blank">praveen@multicorewareinc.com</a>>

>+;*

>+;* This program is free software; you can redistribute it and/or modify

>+;* it under the terms of the GNU General Public License as published by

>+;* the Free Software Foundation; either version 2 of the License, or

>+;* (at your option) any later version.

>+;*

>+;* This program is distributed in the hope that it will be useful,

>+;* but WITHOUT ANY WARRANTY; without even the implied warranty of

>+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

>+;* GNU General Public License for more details.

>+;*

>+;* You should have received a copy of the GNU General Public License

>+;* along with this program; if not, write to the Free Software

>+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.

>+;*

>+;* This program is also available under a commercial proprietary license.

>+;* For more information, contact us at <a href="mailto:licensing@multicorewareinc.com" target="_blank">licensing@multicorewareinc.com</a>.

>+;*****************************************************************************/

>+

>+%include "x86inc.asm"

>+%include "x86util.asm"

>+

>+SECTION_RODATA 32

>+

>+SECTION .text

>+

>+;-----------------------------------------------------------------------------

>+; void pixel_add_ps_4x4(pixel *dest, int destride, pixel *src0, int16_t *scr1, int srcStride0, int srcStride1)

>+;-----------------------------------------------------------------------------

>+INIT_XMM sse4

>+cglobal pixel_add_ps_4x4, 6, 6, 2, dest, destride, src0, scr1, srcStride0, srcStride1

>+

>+add         r5,            r5

>+

>+movd        m0,            [r2]

>+pmovzxbw    m0,            m0

>+movh        m1,            [</div></div>r3]

We can merge movd and pmovzxbw into a single instruction: according to the Intel documentation, pmovzxbw with a memory operand does not need to be aligned to a 16-byte boundary.</pre><pre><div><div class="h5">>+

>+paddw       m0,            m1

>+packuswb    m0,            m0

>+

>+movd        [r0],          m0

>+

>+movd        m0,            [r2 + r4]

>+pmovzxbw    m0,            m0

>+movh        m1,            [r3 + r5]

>+

>+paddw       m0,            m1

>+packuswb    m0,            m0

>+

>+movd        [r0 + r1],     m0

>+

>+movd        m0,            [r2 + 2 * r4]

>+pmovzxbw    m0,            m0

>+movh        m1,            [r3 + 2 * r5]

>+

>+paddw       m0,            m1

>+packuswb    m0,            m0

>+

>+movd        [r0 + 2 * r1], m0

>+

>+lea         r0,            [r0 + 2 * r1]

>+lea         r2,            [r2 + 2 * r4]

>+lea         r3,            [r3 + 2 * r5]

>+

>+movd        m0,            [r2 + r4]

>+pmovzxbw    m0,            m0

>+movh        m1,            [r3 + r5]

>+

>+paddw       m0,            m1

>+packuswb    m0,            m0

>+

>+movd        [r0 + r1],     </div></div>m0

>+

>+RET

>_______________________________________________

>x265-devel mailing list

><a href="mailto:x265-devel@videolan.org" target="_blank">x265-devel@videolan.org</a>

><a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a>

</pre></div><br>_______________________________________________<br>

x265-devel mailing list<br>

<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>

<a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>

<br></blockquote></div><br></div>