<div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><DIV>Sure<BR><BR></DIV>在 2013-11-14 22:19:07，"Steve Borho" &lt;steve@borho.org&gt; 写道：<BR>
<BLOCKQUOTE id="isReplyContent" style="PADDING-LEFT: 1ex; MARGIN: 0px 0px 0px 0.8ex; BORDER-LEFT: #ccc 1px solid">
<DIV dir="ltr"><BR>
<DIV class="gmail_extra"><BR><BR>
<DIV class="gmail_quote">On Thu, Nov 14, 2013 at 2:47 AM, Min Chen <SPAN dir="ltr">&lt;<A href="mailto:chenm003@163.com" target="_blank">chenm003@163.com</A>&gt;</SPAN> wrote:<BR>
<BLOCKQUOTE class="gmail_quote" style="PADDING-LEFT: 1ex; MARGIN: 0px 0px 0px 0.8ex; BORDER-LEFT: #ccc 1px solid"># HG changeset patch<BR># User Min Chen &lt;<A href="mailto:chenm003@163.com">chenm003@163.com</A>&gt;<BR># Date 1384418703 -28800<BR># Node ID 8e22129119d6d8049996ed5f487625e4801b0a50<BR># Parent d80ab2913b31e678334fb0941066c313dcb2d3b7<BR>asm: assembly code for calcrecon[]<BR></BLOCKQUOTE>
<DIV><BR></DIV>
<DIV>the EOLN changes to pixel-util.asm are not making it through your email client properly, so this patch is not importing. Can you send this to me as a patch file?</DIV>
<DIV> </DIV>
<BLOCKQUOTE class="gmail_quote" style="PADDING-LEFT: 1ex; MARGIN: 0px 0px 0px 0.8ex; BORDER-LEFT: #ccc 1px solid">diff -r d80ab2913b31 -r 8e22129119d6 source/common/x86/asm-primitives.cpp<BR>--- a/source/common/x86/asm-primitives.cpp Wed Nov 13 14:30:22 2013 +0530<BR>+++ b/source/common/x86/asm-primitives.cpp Thu Nov 14 16:45:03 2013 +0800<BR>@@ -450,6 +450,8 @@<BR><BR> p.cvt32to16_shr = x265_cvt32to16_shr_sse2;<BR> p.ipfilter_ss[FILTER_V_S_S_8] = x265_interp_8tap_v_ss_sse2;<BR>+ p.calcrecon[BLOCK_4x4] = x265_calcRecons4_sse2;<BR>+ p.calcrecon[BLOCK_8x8] = x265_calcRecons8_sse2;<BR> }<BR> if (cpuMask & X265_CPU_SSSE3)<BR> {<BR>@@ -525,6 +527,9 @@<BR> p.chroma_copy_sp[CHROMA_2x4] = x265_blockcopy_sp_2x4_sse4;<BR> p.chroma_copy_sp[CHROMA_2x8] = x265_blockcopy_sp_2x8_sse4;<BR> p.chroma_copy_sp[CHROMA_6x8] = x265_blockcopy_sp_6x8_sse4;<BR>+ p.calcrecon[BLOCK_16x16] = x265_calcRecons16_sse4;<BR>+ p.calcrecon[BLOCK_32x32] = x265_calcRecons32_sse4;<BR>+ p.calcrecon[BLOCK_64x64] = x265_calcRecons64_sse4;<BR> }<BR> if (cpuMask & X265_CPU_AVX)<BR> {<BR>diff -r d80ab2913b31 -r 8e22129119d6 source/common/x86/pixel-util.asm<BR>--- a/source/common/x86/pixel-util.asm Wed Nov 13 14:30:22 2013 +0530<BR>+++ b/source/common/x86/pixel-util.asm Thu Nov 14 16:45:03 2013 +0800<BR>@@ -1,103 +1,475 @@<BR>-;*****************************************************************************<BR><BR>-;* Copyright (C) 2013 x265 project<BR><BR>-;*<BR><BR>-;* Authors: Min Chen <<A href="mailto:chenm003@163.com">chenm003@163.com</A>> <<A href="mailto:min.chen@multicorewareinc.com">min.chen@multicorewareinc.com</A>><BR><BR>-;*<BR><BR>-;* This program is free software; you can redistribute it and/or modify<BR><BR>-;* it under the terms of the GNU General Public License as published by<BR><BR>-;* the Free Software Foundation; either version 2 of the License, or<BR><BR>-;* (at your option) any later version.<BR><BR>-;*<BR><BR>-;* This program is distributed in the hope that it will be useful,<BR><BR>-;* but 
WITHOUT ANY WARRANTY; without even the implied warranty of<BR><BR>-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<BR><BR>-;* GNU General Public License for more details.<BR><BR>-;*<BR><BR>-;* You should have received a copy of the GNU General Public License<BR><BR>-;* along with this program; if not, write to the Free Software<BR><BR>-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.<BR><BR>-;*<BR><BR>-;* This program is also available under a commercial proprietary license.<BR><BR>-;* For more information, contact us at <A href="mailto:licensing@multicorewareinc.com">licensing@multicorewareinc.com</A>.<BR><BR>-;*****************************************************************************/<BR><BR>-<BR><BR>-%include "x86inc.asm"<BR><BR>-%include "x86util.asm"<BR><BR>-<BR><BR>-SECTION_RODATA 32<BR><BR>-<BR><BR>-SECTION .text<BR><BR>-<BR><BR>-<BR><BR>-;-----------------------------------------------------------------------------<BR><BR>-; void cvt32to16_shr(short *dst, int *src, intptr_t stride, int shift, int size)<BR><BR>-;-----------------------------------------------------------------------------<BR><BR>-INIT_XMM sse2<BR><BR>-cglobal cvt32to16_shr, 5, 7, 1, dst, src, stride<BR><BR>-%define rnd m7<BR><BR>-%define shift m6<BR><BR>-<BR><BR>- ; make shift<BR><BR>- mov r5d, r3m<BR><BR>- movd shift, r5d<BR><BR>-<BR><BR>- ; make round<BR><BR>- dec r5<BR><BR>- xor r6, r6<BR><BR>- bts r6, r5<BR><BR>-<BR><BR>- movd rnd, r6d<BR><BR>- pshufd rnd, rnd, 0<BR><BR>-<BR><BR>- ; register alloc<BR><BR>- ; r0 - dst<BR><BR>- ; r1 - src<BR><BR>- ; r2 - stride * 2 (short*)<BR><BR>- ; r3 - lx<BR><BR>- ; r4 - size<BR><BR>- ; r5 - ly<BR><BR>- ; r6 - diff<BR><BR>- lea r2, [r2 * 2]<BR><BR>-<BR><BR>- mov r4d, r4m<BR><BR>- mov r5, r4<BR><BR>- mov r6, r2<BR><BR>- sub r6, r4<BR><BR>- lea r6, [r6 * 2]<BR><BR>-<BR><BR>- shr r5, 1<BR><BR>-.loop_row:<BR><BR>-<BR><BR>- mov r3, r4<BR><BR>- shr r3, 2<BR><BR>-.loop_col:<BR><BR>- ; row 0<BR><BR>- movu m0, 
[r1]<BR><BR>- paddd m0, rnd<BR><BR>- psrad m0, shift<BR><BR>- packssdw m0, m0<BR><BR>- movh [r0], m0<BR><BR>-<BR><BR>- ; row 1<BR><BR>- movu m0, [r1 + r4 * 4]<BR><BR>- paddd m0, rnd<BR><BR>- psrad m0, shift<BR><BR>- packssdw m0, m0<BR><BR>- movh [r0 + r2], m0<BR><BR>-<BR><BR>- ; move col pointer<BR><BR>- add r1, 16<BR><BR>- add r0, 8<BR><BR>-<BR><BR>- dec r3<BR><BR>- jg .loop_col<BR><BR>-<BR><BR>- ; update pointer<BR><BR>- lea r1, [r1 + r4 * 4]<BR><BR>- add r0, r6<BR><BR>-<BR><BR>- ; end of loop_row<BR><BR>- dec r5<BR><BR>- jg .loop_row<BR><BR>-<BR><BR>- RET<BR><BR>+;*****************************************************************************<BR>+;* Copyright (C) 2013 x265 project<BR>+;*<BR>+;* Authors: Min Chen <<A href="mailto:chenm003@163.com">chenm003@163.com</A>> <<A href="mailto:min.chen@multicorewareinc.com">min.chen@multicorewareinc.com</A>><BR>+;*<BR>+;* This program is free software; you can redistribute it and/or modify<BR>+;* it under the terms of the GNU General Public License as published by<BR>+;* the Free Software Foundation; either version 2 of the License, or<BR>+;* (at your option) any later version.<BR>+;*<BR>+;* This program is distributed in the hope that it will be useful,<BR>+;* but WITHOUT ANY WARRANTY; without even the implied warranty of<BR>+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the<BR>+;* GNU General Public License for more details.<BR>+;*<BR>+;* You should have received a copy of the GNU General Public License<BR>+;* along with this program; if not, write to the Free Software<BR>+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.<BR>+;*<BR>+;* This program is also available under a commercial proprietary license.<BR>+;* For more information, contact us at <A href="mailto:licensing@multicorewareinc.com">licensing@multicorewareinc.com</A>.<BR>+;*****************************************************************************/<BR>+<BR>+%include "x86inc.asm"<BR>+%include "x86util.asm"<BR>+<BR>+SECTION_RODATA 32<BR>+<BR>+SECTION .text<BR>+<BR>+<BR>+;-----------------------------------------------------------------------------<BR>+; void cvt32to16_shr(short *dst, int *src, intptr_t stride, int shift, int size)<BR>+;-----------------------------------------------------------------------------<BR>+INIT_XMM sse2<BR>+cglobal cvt32to16_shr, 5, 7, 1, dst, src, stride<BR>+%define rnd m7<BR>+%define shift m6<BR>+<BR>+ ; make shift<BR>+ mov r5d, r3m<BR>+ movd shift, r5d<BR>+<BR>+ ; make round<BR>+ dec r5<BR>+ xor r6, r6<BR>+ bts r6, r5<BR>+<BR>+ movd rnd, r6d<BR>+ pshufd rnd, rnd, 0<BR>+<BR>+ ; register alloc<BR>+ ; r0 - dst<BR>+ ; r1 - src<BR>+ ; r2 - stride * 2 (short*)<BR>+ ; r3 - lx<BR>+ ; r4 - size<BR>+ ; r5 - ly<BR>+ ; r6 - diff<BR>+ lea r2, [r2 * 2]<BR>+<BR>+ mov r4d, r4m<BR>+ mov r5, r4<BR>+ mov r6, r2<BR>+ sub r6, r4<BR>+ lea r6, [r6 * 2]<BR>+<BR>+ shr r5, 1<BR>+.loop_row:<BR>+<BR>+ mov r3, r4<BR>+ shr r3, 2<BR>+.loop_col:<BR>+ ; row 0<BR>+ movu m0, [r1]<BR>+ paddd m0, rnd<BR>+ psrad m0, shift<BR>+ packssdw m0, m0<BR>+ movh [r0], m0<BR>+<BR>+ ; row 1<BR>+ movu m0, [r1 + r4 * 4]<BR>+ paddd m0, rnd<BR>+ psrad m0, shift<BR>+ packssdw m0, m0<BR>+ movh [r0 + r2], m0<BR>+<BR>+ ; move col pointer<BR>+ add r1, 16<BR>+ add r0, 8<BR>+<BR>+ dec r3<BR>+ jg .loop_col<BR>+<BR>+ ; update pointer<BR>+ lea r1, [r1 + r4 * 4]<BR>+ add 
r0, r6<BR>+<BR>+ ; end of loop_row<BR>+ dec r5<BR>+ jg .loop_row<BR>+<BR>+ RET<BR>+<BR>+<BR>+;-----------------------------------------------------------------------------<BR>+; void calcrecon(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred)<BR>+;-----------------------------------------------------------------------------<BR>+INIT_XMM sse2<BR>+cglobal calcRecons4<BR>+%if ARCH_X86_64 == 1<BR>+ DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8<BR>+ PROLOGUE 6,9,4<BR>+%else<BR>+ DECLARE_REG_TMP 0,1,2,3,4,5<BR>+ PROLOGUE 6,7,4<BR>+ %define t6 r6m<BR>+ %define t6d r6d<BR>+ %define t7 r7m<BR>+ %define t8d r6d<BR>+%endif<BR>+<BR>+ mov t6d, r6m<BR>+%if ARCH_X86_64 == 0<BR>+ add t6d, t6d<BR>+ mov r6m, t6d<BR>+%else<BR>+ mov r5d, r5m<BR>+ mov r7d, r7m<BR>+ add t6d, t6d<BR>+%endif<BR>+<BR>+ pxor m0, m0<BR>+ mov t8d, 4/2<BR>+.loop:<BR>+ movd m1, [t0]<BR>+ movd m2, [t0 + t5]<BR>+ punpckldq m1, m2<BR>+ punpcklbw m1, m0<BR>+ movh m2, [t1]<BR>+ movh m3, [t1 + t5 * 2]<BR>+ punpcklqdq m2, m3<BR>+ paddw m1, m2<BR>+ packuswb m1, m1<BR>+<BR>+ ; store recon[] and recipred[]<BR>+ movd [t2], m1<BR>+ movd [t4], m1<BR>+ add t4, t7<BR>+ pshufd m2, m1, 1<BR>+ movd [t2 + t5], m2<BR>+ movd [t4], m2<BR>+ add t4, t7<BR>+<BR>+ ; store recqt[]<BR>+ punpcklbw m1, m0<BR>+ movlps [t3], m1<BR>+ add t3, t6<BR>+ movhps [t3], m1<BR>+ add t3, t6<BR>+<BR>+ lea t0, [t0 + t5 * 2]<BR>+ lea t1, [t1 + t5 * 4]<BR>+ lea t2, [t2 + t5 * 2]<BR>+<BR>+ dec t8d<BR>+ jnz .loop<BR>+ RET<BR>+<BR>+<BR>+INIT_XMM sse2<BR>+cglobal calcRecons8<BR>+%if ARCH_X86_64 == 1<BR>+ DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8<BR>+ PROLOGUE 6,9,5<BR>+%else<BR>+ DECLARE_REG_TMP 0,1,2,3,4,5<BR>+ PROLOGUE 6,7,5<BR>+ %define t6 r6m<BR>+ %define t6d r6d<BR>+ %define t7 r7m<BR>+ %define t8d r6d<BR>+%endif<BR>+<BR>+ mov t6d, r6m<BR>+%if ARCH_X86_64 == 0<BR>+ add t6d, t6d<BR>+ mov r6m, t6d<BR>+%else<BR>+ mov r5d, r5m<BR>+ mov r7d, r7m<BR>+ add t6d, t6d<BR>+%endif<BR>+<BR>+ pxor m0, m0<BR>+ mov 
t8d, 8/2<BR>+.loop:<BR>+ movh m1, [t0]<BR>+ movh m2, [t0 + t5]<BR>+ punpcklbw m1, m0<BR>+ punpcklbw m2, m0<BR>+ movu m3, [t1]<BR>+ movu m4, [t1 + t5 * 2]<BR>+ paddw m1, m3<BR>+ paddw m2, m4<BR>+ packuswb m1, m2<BR>+<BR>+ ; store recon[] and recipred[]<BR>+ movlps [t2], m1<BR>+ movhps [t2 + t5], m1<BR>+ movlps [t4], m1<BR>+%if ARCH_X86_64 == 0<BR>+ add t4, t7<BR>+ movhps [t4], m1<BR>+ add t4, t7<BR>+%else<BR>+ movhps [t4 + t7], m1<BR>+ lea t4, [t4 + t7 * 2]<BR>+%endif<BR>+<BR>+ ; store recqt[]<BR>+ punpcklbw m2, m1, m0<BR>+ punpckhbw m1, m0<BR>+ movu [t3], m2<BR>+ add t3, t6<BR>+ movu [t3], m1<BR>+ add t3, t6<BR>+<BR>+ lea t0, [t0 + t5 * 2]<BR>+ lea t1, [t1 + t5 * 4]<BR>+ lea t2, [t2 + t5 * 2]<BR>+<BR>+ dec t8d<BR>+ jnz .loop<BR>+ RET<BR>+<BR>+<BR>+INIT_XMM sse4<BR>+cglobal calcRecons16<BR>+%if ARCH_X86_64 == 1<BR>+ DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8<BR>+ PROLOGUE 6,9,5<BR>+%else<BR>+ DECLARE_REG_TMP 0,1,2,3,4,5<BR>+ PROLOGUE 6,7,5<BR>+ %define t6 r6m<BR>+ %define t6d r6d<BR>+ %define t7 r7m<BR>+ %define t8d r6d<BR>+%endif<BR>+<BR>+ mov t6d, r6m<BR>+%if ARCH_X86_64 == 0<BR>+ add t6d, t6d<BR>+ mov r6m, t6d<BR>+%else<BR>+ mov r5d, r5m<BR>+ mov r7d, r7m<BR>+ add t6d, t6d<BR>+%endif<BR>+<BR>+ pxor m0, m0<BR>+ mov t8d, 16<BR>+.loop:<BR>+ movu m2, [t0]<BR>+ pmovzxbw m1, m2<BR>+ punpckhbw m2, m0<BR>+ movu m3, [t1]<BR>+ movu m4, [t1 + 16]<BR>+ paddw m1, m3<BR>+ paddw m2, m4<BR>+ packuswb m1, m2<BR>+<BR>+ ; store recon[] and recipred[]<BR>+ movu [t2], m1<BR>+ movu [t4], m1<BR>+<BR>+ ; store recqt[]<BR>+ pmovzxbw m2, m1<BR>+ punpckhbw m1, m0<BR>+ movu [t3], m2<BR>+ movu [t3 + 16], m1<BR>+<BR>+ add t3, t6<BR>+ add t4, t7<BR>+ add t0, t5<BR>+ lea t1, [t1 + t5 * 2]<BR>+ add t2, t5<BR>+<BR>+ dec t8d<BR>+ jnz .loop<BR>+ RET<BR>+<BR>+<BR>+INIT_XMM sse4<BR>+cglobal calcRecons32<BR>+%if ARCH_X86_64 == 1<BR>+ DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8<BR>+ PROLOGUE 6,9,7<BR>+%else<BR>+ DECLARE_REG_TMP 0,1,2,3,4,5<BR>+ PROLOGUE 6,7,7<BR>+ %define t6 r6m<BR>+ %define t6d r6d<BR>+ %define t7 
r7m<BR>+ %define t8d r6d<BR>+%endif<BR>+<BR>+ mov t6d, r6m<BR>+%if ARCH_X86_64 == 0<BR>+ add t6d, t6d<BR>+ mov r6m, t6d<BR>+%else<BR>+ mov r5d, r5m<BR>+ mov r7d, r7m<BR>+ add t6d, t6d<BR>+%endif<BR>+<BR>+ pxor m0, m0<BR>+ mov t8d, 32<BR>+.loop:<BR>+ movu m2, [t0]<BR>+ movu m4, [t0 + 16]<BR>+ pmovzxbw m1, m2<BR>+ punpckhbw m2, m0<BR>+ pmovzxbw m3, m4<BR>+ punpckhbw m4, m0<BR>+<BR>+ movu m5, [t1 + 0 * 16]<BR>+ movu m6, [t1 + 1 * 16]<BR>+ paddw m1, m5<BR>+ paddw m2, m6<BR>+ packuswb m1, m2<BR>+<BR>+ movu m5, [t1 + 2 * 16]<BR>+ movu m6, [t1 + 3 * 16]<BR>+ paddw m3, m5<BR>+ paddw m4, m6<BR>+ packuswb m3, m4<BR>+<BR>+ ; store recon[] and recipred[]<BR>+ movu [t2], m1<BR>+ movu [t2 + 16], m3<BR>+ movu [t4], m1<BR>+ movu [t4 + 16], m3<BR>+<BR>+ ; store recqt[]<BR>+ pmovzxbw m2, m1<BR>+ punpckhbw m1, m0<BR>+ movu [t3 + 0 * 16], m2<BR>+ movu [t3 + 1 * 16], m1<BR>+ pmovzxbw m4, m3<BR>+ punpckhbw m3, m0<BR>+ movu [t3 + 2 * 16], m4<BR>+ movu [t3 + 3 * 16], m3<BR>+<BR>+ add t3, t6<BR>+ add t4, t7<BR>+ add t0, t5<BR>+ lea t1, [t1 + t5 * 2]<BR>+ add t2, t5<BR>+<BR>+ dec t8d<BR>+ jnz .loop<BR>+ RET<BR>+<BR>+<BR>+INIT_XMM sse4<BR>+cglobal calcRecons64<BR>+%if ARCH_X86_64 == 1<BR>+ DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8<BR>+ PROLOGUE 6,9,7<BR>+%else<BR>+ DECLARE_REG_TMP 0,1,2,3,4,5<BR>+ PROLOGUE 6,7,7<BR>+ %define t6 r6m<BR>+ %define t6d r6d<BR>+ %define t7 r7m<BR>+ %define t8d r6d<BR>+%endif<BR>+<BR>+ mov t6d, r6m<BR>+%if ARCH_X86_64 == 0<BR>+ add t6d, t6d<BR>+ mov r6m, t6d<BR>+%else<BR>+ mov r5d, r5m<BR>+ mov r7d, r7m<BR>+ add t6d, t6d<BR>+%endif<BR>+<BR>+ pxor m0, m0<BR>+ mov t8d, 64<BR>+.loop:<BR>+ ; left 32 pixel<BR>+ movu m2, [t0 + 0 * 16]<BR>+ movu m4, [t0 + 1 * 16]<BR>+ pmovzxbw m1, m2<BR>+ punpckhbw m2, m0<BR>+ pmovzxbw m3, m4<BR>+ punpckhbw m4, m0<BR>+<BR>+ movu m5, [t1 + 0 * 16]<BR>+ movu m6, [t1 + 1 * 16]<BR>+ paddw m1, m5<BR>+ paddw m2, m6<BR>+ packuswb m1, m2<BR>+<BR>+ movu m5, [t1 + 2 * 16]<BR>+ movu m6, [t1 + 3 * 16]<BR>+ paddw m3, m5<BR>+ paddw m4, m6<BR>+ packuswb m3, 
m4<BR>+<BR>+ ; store recon[] and recipred[]<BR>+ movu [t2 + 0 * 16], m1<BR>+ movu [t2 + 1 * 16], m3<BR>+ movu [t4 + 0 * 16], m1<BR>+ movu [t4 + 1 * 16], m3<BR>+<BR>+ ; store recqt[]<BR>+ pmovzxbw m2, m1<BR>+ punpckhbw m1, m0<BR>+ movu [t3 + 0 * 16], m2<BR>+ movu [t3 + 1 * 16], m1<BR>+ pmovzxbw m4, m3<BR>+ punpckhbw m3, m0<BR>+ movu [t3 + 2 * 16], m4<BR>+ movu [t3 + 3 * 16], m3<BR>+<BR>+ ; right 32 pixel<BR>+ movu m2, [t0 + 2 * 16]<BR>+ movu m4, [t0 + 3 * 16]<BR>+ pmovzxbw m1, m2<BR>+ punpckhbw m2, m0<BR>+ pmovzxbw m3, m4<BR>+ punpckhbw m4, m0<BR>+<BR>+ movu m5, [t1 + 4 * 16]<BR>+ movu m6, [t1 + 5 * 16]<BR>+ paddw m1, m5<BR>+ paddw m2, m6<BR>+ packuswb m1, m2<BR>+<BR>+ movu m5, [t1 + 6 * 16]<BR>+ movu m6, [t1 + 7 * 16]<BR>+ paddw m3, m5<BR>+ paddw m4, m6<BR>+ packuswb m3, m4<BR>+<BR>+ ; store recon[] and recipred[]<BR>+ movu [t2 + 2 * 16], m1<BR>+ movu [t2 + 3 * 16], m3<BR>+ movu [t4 + 2 * 16], m1<BR>+ movu [t4 + 3 * 16], m3<BR>+<BR>+ ; store recqt[]<BR>+ pmovzxbw m2, m1<BR>+ punpckhbw m1, m0<BR>+ movu [t3 + 4 * 16], m2<BR>+ movu [t3 + 5 * 16], m1<BR>+ pmovzxbw m4, m3<BR>+ punpckhbw m3, m0<BR>+ movu [t3 + 6 * 16], m4<BR>+ movu [t3 + 7 * 16], m3<BR>+<BR>+ add t3, t6<BR>+ add t4, t7<BR>+ add t0, t5<BR>+ lea t1, [t1 + t5 * 2]<BR>+ add t2, t5<BR>+<BR>+ dec t8d<BR>+ jnz .loop<BR>+ RET<BR>diff -r d80ab2913b31 -r 8e22129119d6 source/common/x86/pixel.h<BR>--- a/source/common/x86/pixel.h Wed Nov 13 14:30:22 2013 +0530<BR>+++ b/source/common/x86/pixel.h Thu Nov 14 16:45:03 2013 +0800<BR>@@ -355,4 +355,10 @@<BR> #undef CHROMA_PIXELSUB_DEF<BR> #undef LUMA_PIXELSUB_DEF<BR><BR>+void x265_calcRecons4_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);<BR>+void x265_calcRecons8_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);<BR>+void x265_calcRecons16_sse4(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, 
pixel *reconipred, int stride, int strideqt, int strideipred);<BR>+void x265_calcRecons32_sse4(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);<BR>+void x265_calcRecons64_sse4(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);<BR>+<BR> #endif // ifndef X265_I386_PIXEL_H<BR>diff -r d80ab2913b31 -r 8e22129119d6 source/test/pixelharness.cpp<BR>--- a/source/test/pixelharness.cpp Wed Nov 13 14:30:22 2013 +0530<BR>+++ b/source/test/pixelharness.cpp Thu Nov 14 16:45:03 2013 +0800<BR>@@ -286,8 +286,8 @@<BR> for (int i = 0; i < ITERS; i++)<BR> {<BR> int stride = STRIDE;<BR>+ ref(pbuf1 + j, sbuf1 + j, ref_reco, ref_recq, ref_pred, stride, stride, stride);<BR> opt(pbuf1 + j, sbuf1 + j, opt_reco, opt_recq, opt_pred, stride, stride, stride);<BR>- ref(pbuf1 + j, sbuf1 + j, ref_reco, ref_recq, ref_pred, stride, stride, stride);<BR><BR> if (memcmp(ref_recq, opt_recq, 64 * 64 * sizeof(int16_t)))<BR> return false;<BR><BR>_______________________________________________<BR>x265-devel mailing list<BR><A href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</A><BR><A href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</A><BR></BLOCKQUOTE></DIV><BR><BR clear="all">
<DIV><BR></DIV>-- <BR>Steve Borho </DIV></DIV></BLOCKQUOTE></div>