On Thu, Nov 14, 2013 at 11:18 PM, Min Chen <chenm003@163.com> wrote:
> # HG changeset patch
> # User Min Chen <chenm003@163.com>
> # Date 1384492636 -28800
> # Node ID 9592525e376f4b41d4fde000ae77814a00b06822
> # Parent ee42f57411ae746095dd36e36064145ed869d73c
> asm: residual buffer is alignment to size, so we can use alignment load instruction

This patch is malformed; can you send it as a patch file?
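For example, "hg export 9592525e376f" redirected into a file and attached, or hg email (patchbomb extension), should come through intact.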

> diff -r ee42f57411ae -r 9592525e376f source/Lib/TLibEncoder/TEncSearch.cpp
> --- a/source/Lib/TLibEncoder/TEncSearch.cpp Thu Nov 14 13:38:07 2013 -0600
> +++ b/source/Lib/TLibEncoder/TEncSearch.cpp Fri Nov 15 13:17:16 2013 +0800
> @@ -502,6 +502,8 @@
> }
>
> //===== reconstruction =====
> + assert(((uint32_t)residual & (width - 1)) == 0);
> + assert(width <= 32);
> primitives.calcrecon[size](pred, residual, recon, reconQt, reconIPred, stride, MAX_CU_SIZE, reconIPredStride);
>
> //===== update distortion =====
> @@ -636,6 +638,8 @@
> }
>
> //===== reconstruction =====
> + assert(((uint32_t)residual & (width - 1)) == 0);
> + assert(width <= 32);
> primitives.calcrecon[size](pred, residual, recon, reconQt, reconIPred, stride, MAX_CU_SIZE / 2, reconIPredStride);
>
> //===== update distortion =====
> diff -r ee42f57411ae -r 9592525e376f source/common/x86/pixel-util.asm
> --- a/source/common/x86/pixel-util.asm Thu Nov 14 13:38:07 2013 -0600
> +++ b/source/common/x86/pixel-util.asm Fri Nov 15 13:17:16 2013 +0800
> @@ -1,475 +1,469 @@
> -;*****************************************************************************
> -;* Copyright (C) 2013 x265 project
> -;*
> -;* Authors: Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
> -;*
> -;* This program is free software; you can redistribute it and/or modify
> -;* it under the terms of the GNU General Public License as published by
> -;* the Free Software Foundation; either version 2 of the License, or
> -;* (at your option) any later version.
> -;*
> -;* This program is distributed in the hope that it will be useful,
> -;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> -;* GNU General Public License for more details.
> -;*
> -;* You should have received a copy of the GNU General Public License
> -;* along with this program; if not, write to the Free Software
> -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
> -;*
> -;* This program is also available under a commercial proprietary license.
> -;* For more information, contact us at licensing@multicorewareinc.com.
> -;*****************************************************************************/
> -
> -%include "x86inc.asm"
> -%include "x86util.asm"
> -
> -SECTION_RODATA 32
> -
> -SECTION .text
> -
> -
> -;-----------------------------------------------------------------------------
> -; void cvt32to16_shr(short *dst, int *src, intptr_t stride, int shift, int size)
> -;-----------------------------------------------------------------------------
> -INIT_XMM sse2
> -cglobal cvt32to16_shr, 5, 7, 1, dst, src, stride
> -%define rnd m7
> -%define shift m6
> -
> - ; make shift
> - mov r5d, r3m
> - movd shift, r5d
> -
> - ; make round
> - dec r5
> - xor r6, r6
> - bts r6, r5
> -
> - movd rnd, r6d
> - pshufd rnd, rnd, 0
> -
> - ; register alloc
> - ; r0 - dst
> - ; r1 - src
> - ; r2 - stride * 2 (short*)
> - ; r3 - lx
> - ; r4 - size
> - ; r5 - ly
> - ; r6 - diff
> - lea r2, [r2 * 2]
> -
> - mov r4d, r4m
> - mov r5, r4
> - mov r6, r2
> - sub r6, r4
> - lea r6, [r6 * 2]
> -
> - shr r5, 1
> -.loop_row:
> -
> - mov r3, r4
> - shr r3, 2
> -.loop_col:
> - ; row 0
> - movu m0, [r1]
> - paddd m0, rnd
> - psrad m0, shift
> - packssdw m0, m0
> - movh [r0], m0
> -
> - ; row 1
> - movu m0, [r1 + r4 * 4]
> - paddd m0, rnd
> - psrad m0, shift
> - packssdw m0, m0
> - movh [r0 + r2], m0
> -
> - ; move col pointer
> - add r1, 16
> - add r0, 8
> -
> - dec r3
> - jg .loop_col
> -
> - ; update pointer
> - lea r1, [r1 + r4 * 4]
> - add r0, r6
> -
> - ; end of loop_row
> - dec r5
> - jg .loop_row
> -
> - RET
> -
> -
> -;-----------------------------------------------------------------------------
> -; void calcrecon(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred)
> -;-----------------------------------------------------------------------------
> -INIT_XMM sse2
> -cglobal calcRecons4
> -%if ARCH_X86_64 == 1
> - DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
> - PROLOGUE 6,9,4
> -%else
> - DECLARE_REG_TMP 0,1,2,3,4,5
> - PROLOGUE 6,7,4
> - %define t6 r6m
> - %define t6d r6d
> - %define t7 r7m
> - %define t8d r6d
> -%endif
> -
> - mov t6d, r6m
> -%if ARCH_X86_64 == 0
> - add t6d, t6d
> - mov r6m, t6d
> -%else
> - mov r5d, r5m
> - mov r7d, r7m
> - add t6d, t6d
> -%endif
> -
> - pxor m0, m0
> - mov t8d, 4/2
> -.loop:
> - movd m1, [t0]
> - movd m2, [t0 + t5]
> - punpckldq m1, m2
> - punpcklbw m1, m0
> - movh m2, [t1]
> - movh m3, [t1 + t5 * 2]
> - punpcklqdq m2, m3
> - paddw m1, m2
> - packuswb m1, m1
> -
> - ; store recon[] and recipred[]
> - movd [t2], m1
> - movd [t4], m1
> - add t4, t7
> - pshufd m2, m1, 1
> - movd [t2 + t5], m2
> - movd [t4], m2
> - add t4, t7
> -
> - ; store recqt[]
> - punpcklbw m1, m0
> - movlps [t3], m1
> - add t3, t6
> - movhps [t3], m1
> - add t3, t6
> -
> - lea t0, [t0 + t5 * 2]
> - lea t1, [t1 + t5 * 4]
> - lea t2, [t2 + t5 * 2]
> -
> - dec t8d
> - jnz .loop
> - RET
> -
> -
> -INIT_XMM sse2
> -cglobal calcRecons8
> -%if ARCH_X86_64 == 1
> - DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
> - PROLOGUE 6,9,5
> -%else
> - DECLARE_REG_TMP 0,1,2,3,4,5
> - PROLOGUE 6,7,5
> - %define t6 r6m
> - %define t6d r6d
> - %define t7 r7m
> - %define t8d r6d
> -%endif
> -
> - mov t6d, r6m
> -%if ARCH_X86_64 == 0
> - add t6d, t6d
> - mov r6m, t6d
> -%else
> - mov r5d, r5m
> - mov r7d, r7m
> - add t6d, t6d
> -%endif
> -
> - pxor m0, m0
> - mov t8d, 8/2
> -.loop:
> - movh m1, [t0]
> - movh m2, [t0 + t5]
> - punpcklbw m1, m0
> - punpcklbw m2, m0
> - movu m3, [t1]
> - movu m4, [t1 + t5 * 2]
> - paddw m1, m3
> - paddw m2, m4
> - packuswb m1, m2
> -
> - ; store recon[] and recipred[]
> - movlps [t2], m1
> - movhps [t2 + t5], m1
> - movlps [t4], m1
> -%if ARCH_X86_64 == 0
> - add t4, t7
> - movhps [t4], m1
> - add t4, t7
> -%else
> - movhps [t4 + t7], m1
> - lea t4, [t4 + t7 * 2]
> -%endif
> -
> - ; store recqt[]
> - punpcklbw m2, m1, m0
> - punpckhbw m1, m0
> - movu [t3], m2
> - add t3, t6
> - movu [t3], m1
> - add t3, t6
> -
> - lea t0, [t0 + t5 * 2]
> - lea t1, [t1 + t5 * 4]
> - lea t2, [t2 + t5 * 2]
> -
> - dec t8d
> - jnz .loop
> - RET
> -
> -
> -INIT_XMM sse4
> -cglobal calcRecons16
> -%if ARCH_X86_64 == 1
> - DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
> - PROLOGUE 6,9,5
> -%else
> - DECLARE_REG_TMP 0,1,2,3,4,5
> - PROLOGUE 6,7,5
> - %define t6 r6m
> - %define t6d r6d
> - %define t7 r7m
> - %define t8d r6d
> -%endif
> -
> - mov t6d, r6m
> -%if ARCH_X86_64 == 0
> - add t6d, t6d
> - mov r6m, t6d
> -%else
> - mov r5d, r5m
> - mov r7d, r7m
> - add t6d, t6d
> -%endif
> -
> - pxor m0, m0
> - mov t8d, 16
> -.loop:
> - movu m2, [t0]
> - pmovzxbw m1, m2
> - punpckhbw m2, m0
> - movu m3, [t1]
> - movu m4, [t1 + 16]
> - paddw m1, m3
> - paddw m2, m4
> - packuswb m1, m2
> -
> - ; store recon[] and recipred[]
> - movu [t2], m1
> - movu [t4], m1
> -
> - ; store recqt[]
> - pmovzxbw m2, m1
> - punpckhbw m1, m0
> - movu [t3], m2
> - movu [t3 + 16], m1
> -
> - add t3, t6
> - add t4, t7
> - add t0, t5
> - lea t1, [t1 + t5 * 2]
> - add t2, t5
> -
> - dec t8d
> - jnz .loop
> - RET
> -
> -
> -INIT_XMM sse4
> -cglobal calcRecons32
> -%if ARCH_X86_64 == 1
> - DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
> - PROLOGUE 6,9,7
> -%else
> - DECLARE_REG_TMP 0,1,2,3,4,5
> - PROLOGUE 6,7,7
> - %define t6 r6m
> - %define t6d r6d
> - %define t7 r7m
> - %define t8d r6d
> -%endif
> -
> - mov t6d, r6m
> -%if ARCH_X86_64 == 0
> - add t6d, t6d
> - mov r6m, t6d
> -%else
> - mov r5d, r5m
> - mov r7d, r7m
> - add t6d, t6d
> -%endif
> -
> - pxor m0, m0
> - mov t8d, 32
> -.loop:
> - movu m2, [t0]
> - movu m4, [t0 + 16]
> - pmovzxbw m1, m2
> - punpckhbw m2, m0
> - pmovzxbw m3, m4
> - punpckhbw m4, m0
> -
> - movu m5, [t1 + 0 * 16]
> - movu m6, [t1 + 1 * 16]
> - paddw m1, m5
> - paddw m2, m6
> - packuswb m1, m2
> -
> - movu m5, [t1 + 2 * 16]
> - movu m6, [t1 + 3 * 16]
> - paddw m3, m5
> - paddw m4, m6
> - packuswb m3, m4
> -
> - ; store recon[] and recipred[]
> - movu [t2], m1
> - movu [t2 + 16], m3
> - movu [t4], m1
> - movu [t4 + 16], m3
> -
> - ; store recqt[]
> - pmovzxbw m2, m1
> - punpckhbw m1, m0
> - movu [t3 + 0 * 16], m2
> - movu [t3 + 1 * 16], m1
> - pmovzxbw m4, m3
> - punpckhbw m3, m0
> - movu [t3 + 2 * 16], m4
> - movu [t3 + 3 * 16], m3
> -
> - add t3, t6
> - add t4, t7
> - add t0, t5
> - lea t1, [t1 + t5 * 2]
> - add t2, t5
> -
> - dec t8d
> - jnz .loop
> - RET
> -
> -
> -INIT_XMM sse4
> -cglobal calcRecons64
> -%if ARCH_X86_64 == 1
> - DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
> - PROLOGUE 6,9,7
> -%else
> - DECLARE_REG_TMP 0,1,2,3,4,5
> - PROLOGUE 6,7,7
> - %define t6 r6m
> - %define t6d r6d
> - %define t7 r7m
> - %define t8d r6d
> -%endif
> -
> - mov t6d, r6m
> -%if ARCH_X86_64 == 0
> - add t6d, t6d
> - mov r6m, t6d
> -%else
> - mov r5d, r5m
> - mov r7d, r7m
> - add t6d, t6d
> -%endif
> -
> - pxor m0, m0
> - mov t8d, 64
> -.loop:
> - ; left 32 pixel
> - movu m2, [t0 + 0 * 16]
> - movu m4, [t0 + 1 * 16]
> - pmovzxbw m1, m2
> - punpckhbw m2, m0
> - pmovzxbw m3, m4
> - punpckhbw m4, m0
> -
> - movu m5, [t1 + 0 * 16]
> - movu m6, [t1 + 1 * 16]
> - paddw m1, m5
> - paddw m2, m6
> - packuswb m1, m2
> -
> - movu m5, [t1 + 2 * 16]
> - movu m6, [t1 + 3 * 16]
> - paddw m3, m5
> - paddw m4, m6
> - packuswb m3, m4
> -
> - ; store recon[] and recipred[]
> - movu [t2 + 0 * 16], m1
> - movu [t2 + 1 * 16], m3
> - movu [t4 + 0 * 16], m1
> - movu [t4 + 1 * 16], m3
> -
> - ; store recqt[]
> - pmovzxbw m2, m1
> - punpckhbw m1, m0
> - movu [t3 + 0 * 16], m2
> - movu [t3 + 1 * 16], m1
> - pmovzxbw m4, m3
> - punpckhbw m3, m0
> - movu [t3 + 2 * 16], m4
> - movu [t3 + 3 * 16], m3
> -
> - ; right 32 pixel
> - movu m2, [t0 + 2 * 16]
> - movu m4, [t0 + 3 * 16]
> - pmovzxbw m1, m2
> - punpckhbw m2, m0
> - pmovzxbw m3, m4
> - punpckhbw m4, m0
> -
> - movu m5, [t1 + 4 * 16]
> - movu m6, [t1 + 5 * 16]
> - paddw m1, m5
> - paddw m2, m6
> - packuswb m1, m2
> -
> - movu m5, [t1 + 6 * 16]
> - movu m6, [t1 + 7 * 16]
> - paddw m3, m5
> - paddw m4, m6
> - packuswb m3, m4
> -
> - ; store recon[] and recipred[]
> - movu [t2 + 2 * 16], m1
> - movu [t2 + 3 * 16], m3
> - movu [t4 + 2 * 16], m1
> - movu [t4 + 3 * 16], m3
> -
> - ; store recqt[]
> - pmovzxbw m2, m1
> - punpckhbw m1, m0
> - movu [t3 + 4 * 16], m2
> - movu [t3 + 5 * 16], m1
> - pmovzxbw m4, m3
> - punpckhbw m3, m0
> - movu [t3 + 6 * 16], m4
> - movu [t3 + 7 * 16], m3
> -
> - add t3, t6
> - add t4, t7
> - add t0, t5
> - lea t1, [t1 + t5 * 2]
> - add t2, t5
> -
> - dec t8d
> - jnz .loop
> - RET
> +;*****************************************************************************
> +;* Copyright (C) 2013 x265 project
> +;*
> +;* Authors: Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
> +;*
> +;* This program is free software; you can redistribute it and/or modify
> +;* it under the terms of the GNU General Public License as published by
> +;* the Free Software Foundation; either version 2 of the License, or
> +;* (at your option) any later version.
> +;*
> +;* This program is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> +;* GNU General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU General Public License
> +;* along with this program; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
> +;*
> +;* This program is also available under a commercial proprietary license.
> +;* For more information, contact us at licensing@multicorewareinc.com.
> +;*****************************************************************************/
> +
> +%include "x86inc.asm"
> +%include "x86util.asm"
> +
> +SECTION_RODATA 32
> +
> +SECTION .text
> +
> +
> +;-----------------------------------------------------------------------------
> +; void cvt32to16_shr(short *dst, int *src, intptr_t stride, int shift, int size)
> +;-----------------------------------------------------------------------------
> +INIT_XMM sse2
> +cglobal cvt32to16_shr, 5, 7, 1, dst, src, stride
> +%define rnd m7
> +%define shift m6
> +
> + ; make shift
> + mov r5d, r3m
> + movd shift, r5d
> +
> + ; make round
> + dec r5
> + xor r6, r6
> + bts r6, r5
> +
> + movd rnd, r6d
> + pshufd rnd, rnd, 0
> +
> + ; register alloc
> + ; r0 - dst
> + ; r1 - src
> + ; r2 - stride * 2 (short*)
> + ; r3 - lx
> + ; r4 - size
> + ; r5 - ly
> + ; r6 - diff
> + lea r2, [r2 * 2]
> +
> + mov r4d, r4m
> + mov r5, r4
> + mov r6, r2
> + sub r6, r4
> + lea r6, [r6 * 2]
> +
> + shr r5, 1
> +.loop_row:
> +
> + mov r3, r4
> + shr r3, 2
> +.loop_col:
> + ; row 0
> + movu m0, [r1]
> + paddd m0, rnd
> + psrad m0, shift
> + packssdw m0, m0
> + movh [r0], m0
> +
> + ; row 1
> + movu m0, [r1 + r4 * 4]
> + paddd m0, rnd
> + psrad m0, shift
> + packssdw m0, m0
> + movh [r0 + r2], m0
> +
> + ; move col pointer
> + add r1, 16
> + add r0, 8
> +
> + dec r3
> + jg .loop_col
> +
> + ; update pointer
> + lea r1, [r1 + r4 * 4]
> + add r0, r6
> +
> + ; end of loop_row
> + dec r5
> + jg .loop_row
> +
> + RET
> +
> +
> +;-----------------------------------------------------------------------------
> +; void calcrecon(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred)
> +;-----------------------------------------------------------------------------
> +INIT_XMM sse2
> +cglobal calcRecons4
> +%if ARCH_X86_64 == 1
> + DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
> + PROLOGUE 6,9,4
> +%else
> + DECLARE_REG_TMP 0,1,2,3,4,5
> + PROLOGUE 6,7,4
> + %define t6 r6m
> + %define t6d r6d
> + %define t7 r7m
> + %define t8d r6d
> +%endif
> +
> + mov t6d, r6m
> +%if ARCH_X86_64 == 0
> + add t6d, t6d
> + mov r6m, t6d
> +%else
> + mov r5d, r5m
> + mov r7d, r7m
> + add t6d, t6d
> +%endif
> +
> + pxor m0, m0
> + mov t8d, 4/2
> +.loop:
> + movd m1, [t0]
> + movd m2, [t0 + t5]
> + punpckldq m1, m2
> + punpcklbw m1, m0
> + movh m2, [t1]
> + movh m3, [t1 + t5 * 2]
> + punpcklqdq m2, m3
> + paddw m1, m2
> + packuswb m1, m1
> +
> + ; store recon[] and recipred[]
> + movd [t2], m1
> + movd [t4], m1
> + add t4, t7
> + pshufd m2, m1, 1
> + movd [t2 + t5], m2
> + movd [t4], m2
> + add t4, t7
> +
> + ; store recqt[]
> + punpcklbw m1, m0
> + movlps [t3], m1
> + add t3, t6
> + movhps [t3], m1
> + add t3, t6
> +
> + lea t0, [t0 + t5 * 2]
> + lea t1, [t1 + t5 * 4]
> + lea t2, [t2 + t5 * 2]
> +
> + dec t8d
> + jnz .loop
> + RET
> +
> +
> +INIT_XMM sse2
> +cglobal calcRecons8
> +%if ARCH_X86_64 == 1
> + DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
> + PROLOGUE 6,9,5
> +%else
> + DECLARE_REG_TMP 0,1,2,3,4,5
> + PROLOGUE 6,7,5
> + %define t6 r6m
> + %define t6d r6d
> + %define t7 r7m
> + %define t8d r6d
> +%endif
> +
> + mov t6d, r6m
> +%if ARCH_X86_64 == 0
> + add t6d, t6d
> + mov r6m, t6d
> +%else
> + mov r5d, r5m
> + mov r7d, r7m
> + add t6d, t6d
> +%endif
> +
> + pxor m0, m0
> + mov t8d, 8/2
> +.loop:
> + movh m1, [t0]
> + movh m2, [t0 + t5]
> + punpcklbw m1, m0
> + punpcklbw m2, m0
> + movu m3, [t1]
> + movu m4, [t1 + t5 * 2]
> + paddw m1, m3
> + paddw m2, m4
> + packuswb m1, m2
> +
> + ; store recon[] and recipred[]
> + movlps [t2], m1
> + movhps [t2 + t5], m1
> + movlps [t4], m1
> +%if ARCH_X86_64 == 0
> + add t4, t7
> + movhps [t4], m1
> + add t4, t7
> +%else
> + movhps [t4 + t7], m1
> + lea t4, [t4 + t7 * 2]
> +%endif
> +
> + ; store recqt[]
> + punpcklbw m2, m1, m0
> + punpckhbw m1, m0
> + movu [t3], m2
> + add t3, t6
> + movu [t3], m1
> + add t3, t6
> +
> + lea t0, [t0 + t5 * 2]
> + lea t1, [t1 + t5 * 4]
> + lea t2, [t2 + t5 * 2]
> +
> + dec t8d
> + jnz .loop
> + RET
> +
> +
> +INIT_XMM sse4
> +cglobal calcRecons16
> +%if ARCH_X86_64 == 1
> + DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
> + PROLOGUE 6,9,3
> +%else
> + DECLARE_REG_TMP 0,1,2,3,4,5
> + PROLOGUE 6,7,3
> + %define t6 r6m
> + %define t6d r6d
> + %define t7 r7m
> + %define t8d r6d
> +%endif
> +
> + mov t6d, r6m
> +%if ARCH_X86_64 == 0
> + add t6d, t6d
> + mov r6m, t6d
> +%else
> + mov r5d, r5m
> + mov r7d, r7m
> + add t6d, t6d
> +%endif
> +
> + pxor m0, m0
> + mov t8d, 16
> +.loop:
> + movu m2, [t0]
> + pmovzxbw m1, m2
> + punpckhbw m2, m0
> + paddw m1, [t1]
> + paddw m2, [t1 + 16]
> + packuswb m1, m2
> +
> + ; store recon[] and recipred[]
> + movu [t2], m1
> + movu [t4], m1
> +
> + ; store recqt[]
> + pmovzxbw m2, m1
> + punpckhbw m1, m0
> + movu [t3], m2
> + movu [t3 + 16], m1
> +
> + add t3, t6
> + add t4, t7
> + add t0, t5
> + lea t1, [t1 + t5 * 2]
> + add t2, t5
> +
> + dec t8d
> + jnz .loop
> + RET
> +
> +
> +INIT_XMM sse4
> +cglobal calcRecons32
> +%if ARCH_X86_64 == 1
> + DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
> + PROLOGUE 6,9,5
> +%else
> + DECLARE_REG_TMP 0,1,2,3,4,5
> + PROLOGUE 6,7,5
> + %define t6 r6m
> + %define t6d r6d
> + %define t7 r7m
> + %define t8d r6d
> +%endif
> +
> + mov t6d, r6m
> +%if ARCH_X86_64 == 0
> + add t6d, t6d
> + mov r6m, t6d
> +%else
> + mov r5d, r5m
> + mov r7d, r7m
> + add t6d, t6d
> +%endif
> +
> + pxor m0, m0
> + mov t8d, 32
> +.loop:
> + movu m2, [t0]
> + movu m4, [t0 + 16]
> + pmovzxbw m1, m2
> + punpckhbw m2, m0
> + pmovzxbw m3, m4
> + punpckhbw m4, m0
> +
> + paddw m1, [t1 + 0 * 16]
> + paddw m2, [t1 + 1 * 16]
> + packuswb m1, m2
> +
> + paddw m3, [t1 + 2 * 16]
> + paddw m4, [t1 + 3 * 16]
> + packuswb m3, m4
> +
> + ; store recon[] and recipred[]
> + movu [t2], m1
> + movu [t2 + 16], m3
> + movu [t4], m1
> + movu [t4 + 16], m3
> +
> + ; store recqt[]
> + pmovzxbw m2, m1
> + punpckhbw m1, m0
> + movu [t3 + 0 * 16], m2
> + movu [t3 + 1 * 16], m1
> + pmovzxbw m4, m3
> + punpckhbw m3, m0
> + movu [t3 + 2 * 16], m4
> + movu [t3 + 3 * 16], m3
> +
> + add t3, t6
> + add t4, t7
> + add t0, t5
> + lea t1, [t1 + t5 * 2]
> + add t2, t5
> +
> + dec t8d
> + jnz .loop
> + RET
> +
> +
> +INIT_XMM sse4
> +cglobal calcRecons64
> +%if ARCH_X86_64 == 1
> + DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
> + PROLOGUE 6,9,7
> +%else
> + DECLARE_REG_TMP 0,1,2,3,4,5
> + PROLOGUE 6,7,7
> + %define t6 r6m
> + %define t6d r6d
> + %define t7 r7m
> + %define t8d r6d
> +%endif
> +
> + mov t6d, r6m
> +%if ARCH_X86_64 == 0
> + add t6d, t6d
> + mov r6m, t6d
> +%else
> + mov r5d, r5m
> + mov r7d, r7m
> + add t6d, t6d
> +%endif
> +
> + pxor m0, m0
> + mov t8d, 64
> +.loop:
> + ; left 32 pixel
> + movu m2, [t0 + 0 * 16]
> + movu m4, [t0 + 1 * 16]
> + pmovzxbw m1, m2
> + punpckhbw m2, m0
> + pmovzxbw m3, m4
> + punpckhbw m4, m0
> +
> + movu m5, [t1 + 0 * 16]
> + movu m6, [t1 + 1 * 16]
> + paddw m1, m5
> + paddw m2, m6
> + packuswb m1, m2
> +
> + movu m5, [t1 + 2 * 16]
> + movu m6, [t1 + 3 * 16]
> + paddw m3, m5
> + paddw m4, m6
> + packuswb m3, m4
> +
> + ; store recon[] and recipred[]
> + movu [t2 + 0 * 16], m1
> + movu [t2 + 1 * 16], m3
> + movu [t4 + 0 * 16], m1
> + movu [t4 + 1 * 16], m3
> +
> + ; store recqt[]
> + pmovzxbw m2, m1
> + punpckhbw m1, m0
> + movu [t3 + 0 * 16], m2
> + movu [t3 + 1 * 16], m1
> + pmovzxbw m4, m3
> + punpckhbw m3, m0
> + movu [t3 + 2 * 16], m4
> + movu [t3 + 3 * 16], m3
> +
> + ; right 32 pixel
> + movu m2, [t0 + 2 * 16]
> + movu m4, [t0 + 3 * 16]
> + pmovzxbw m1, m2
> + punpckhbw m2, m0
> + pmovzxbw m3, m4
> + punpckhbw m4, m0
> +
> + movu m5, [t1 + 4 * 16]
> + movu m6, [t1 + 5 * 16]
> + paddw m1, m5
> + paddw m2, m6
> + packuswb m1, m2
> +
> + movu m5, [t1 + 6 * 16]
> + movu m6, [t1 + 7 * 16]
> + paddw m3, m5
> + paddw m4, m6
> + packuswb m3, m4
> +
> + ; store recon[] and recipred[]
> + movu [t2 + 2 * 16], m1
> + movu [t2 + 3 * 16], m3
> + movu [t4 + 2 * 16], m1
> + movu [t4 + 3 * 16], m3
> +
> + ; store recqt[]
> + pmovzxbw m2, m1
> + punpckhbw m1, m0
> + movu [t3 + 4 * 16], m2
> + movu [t3 + 5 * 16], m1
> + pmovzxbw m4, m3
> + punpckhbw m3, m0
> + movu [t3 + 6 * 16], m4
> + movu [t3 + 7 * 16], m3
> +
> + add t3, t6
> + add t4, t7
> + add t0, t5
> + lea t1, [t1 + t5 * 2]
> + add t2, t5
> +
> + dec t8d
> + jnz .loop
> + RET
>
> _______________________________________________
> x265-devel mailing list
> x265-devel@videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-- 
Steve Borho