[x265] [PATCH 1 of 3] asm: residual buffer is aligned to its size, so we can use aligned load instructions
Min Chen
chenm003 at 163.com
Fri Nov 15 06:18:46 CET 2013
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1384492636 -28800
# Node ID 9592525e376f4b41d4fde000ae77814a00b06822
# Parent ee42f57411ae746095dd36e36064145ed869d73c
asm: residual buffer is aligned to its size, so we can use aligned load instructions
diff -r ee42f57411ae -r 9592525e376f source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp Thu Nov 14 13:38:07 2013 -0600
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp Fri Nov 15 13:17:16 2013 +0800
@@ -502,6 +502,8 @@
}
//===== reconstruction =====
+ assert(((uint32_t)residual & (width - 1)) == 0);
+ assert(width <= 32);
primitives.calcrecon[size](pred, residual, recon, reconQt, reconIPred, stride, MAX_CU_SIZE, reconIPredStride);
//===== update distortion =====
@@ -636,6 +638,8 @@
}
//===== reconstruction =====
+ assert(((uint32_t)residual & (width - 1)) == 0);
+ assert(width <= 32);
primitives.calcrecon[size](pred, residual, recon, reconQt, reconIPred, stride, MAX_CU_SIZE / 2, reconIPredStride);
//===== update distortion =====
diff -r ee42f57411ae -r 9592525e376f source/common/x86/pixel-util.asm
--- a/source/common/x86/pixel-util.asm Thu Nov 14 13:38:07 2013 -0600
+++ b/source/common/x86/pixel-util.asm Fri Nov 15 13:17:16 2013 +0800
@@ -1,475 +1,469 @@
-;*****************************************************************************
-;* Copyright (C) 2013 x265 project
-;*
-;* Authors: Min Chen <chenm003 at 163.com> <min.chen at multicorewareinc.com>
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
-;*
-;* This program is also available under a commercial proprietary license.
-;* For more information, contact us at licensing at multicorewareinc.com.
-;*****************************************************************************/
-
-%include "x86inc.asm"
-%include "x86util.asm"
-
-SECTION_RODATA 32
-
-SECTION .text
-
-
-;-----------------------------------------------------------------------------
-; void cvt32to16_shr(short *dst, int *src, intptr_t stride, int shift, int size)
-;-----------------------------------------------------------------------------
-INIT_XMM sse2
-cglobal cvt32to16_shr, 5, 7, 1, dst, src, stride
-%define rnd m7
-%define shift m6
-
- ; make shift
- mov r5d, r3m
- movd shift, r5d
-
- ; make round
- dec r5
- xor r6, r6
- bts r6, r5
-
- movd rnd, r6d
- pshufd rnd, rnd, 0
-
- ; register alloc
- ; r0 - dst
- ; r1 - src
- ; r2 - stride * 2 (short*)
- ; r3 - lx
- ; r4 - size
- ; r5 - ly
- ; r6 - diff
- lea r2, [r2 * 2]
-
- mov r4d, r4m
- mov r5, r4
- mov r6, r2
- sub r6, r4
- lea r6, [r6 * 2]
-
- shr r5, 1
-.loop_row:
-
- mov r3, r4
- shr r3, 2
-.loop_col:
- ; row 0
- movu m0, [r1]
- paddd m0, rnd
- psrad m0, shift
- packssdw m0, m0
- movh [r0], m0
-
- ; row 1
- movu m0, [r1 + r4 * 4]
- paddd m0, rnd
- psrad m0, shift
- packssdw m0, m0
- movh [r0 + r2], m0
-
- ; move col pointer
- add r1, 16
- add r0, 8
-
- dec r3
- jg .loop_col
-
- ; update pointer
- lea r1, [r1 + r4 * 4]
- add r0, r6
-
- ; end of loop_row
- dec r5
- jg .loop_row
-
- RET
-
-
-;-----------------------------------------------------------------------------
-; void calcrecon(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred)
-;-----------------------------------------------------------------------------
-INIT_XMM sse2
-cglobal calcRecons4
-%if ARCH_X86_64 == 1
- DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
- PROLOGUE 6,9,4
-%else
- DECLARE_REG_TMP 0,1,2,3,4,5
- PROLOGUE 6,7,4
- %define t6 r6m
- %define t6d r6d
- %define t7 r7m
- %define t8d r6d
-%endif
-
- mov t6d, r6m
-%if ARCH_X86_64 == 0
- add t6d, t6d
- mov r6m, t6d
-%else
- mov r5d, r5m
- mov r7d, r7m
- add t6d, t6d
-%endif
-
- pxor m0, m0
- mov t8d, 4/2
-.loop:
- movd m1, [t0]
- movd m2, [t0 + t5]
- punpckldq m1, m2
- punpcklbw m1, m0
- movh m2, [t1]
- movh m3, [t1 + t5 * 2]
- punpcklqdq m2, m3
- paddw m1, m2
- packuswb m1, m1
-
- ; store recon[] and recipred[]
- movd [t2], m1
- movd [t4], m1
- add t4, t7
- pshufd m2, m1, 1
- movd [t2 + t5], m2
- movd [t4], m2
- add t4, t7
-
- ; store recqt[]
- punpcklbw m1, m0
- movlps [t3], m1
- add t3, t6
- movhps [t3], m1
- add t3, t6
-
- lea t0, [t0 + t5 * 2]
- lea t1, [t1 + t5 * 4]
- lea t2, [t2 + t5 * 2]
-
- dec t8d
- jnz .loop
- RET
-
-
-INIT_XMM sse2
-cglobal calcRecons8
-%if ARCH_X86_64 == 1
- DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
- PROLOGUE 6,9,5
-%else
- DECLARE_REG_TMP 0,1,2,3,4,5
- PROLOGUE 6,7,5
- %define t6 r6m
- %define t6d r6d
- %define t7 r7m
- %define t8d r6d
-%endif
-
- mov t6d, r6m
-%if ARCH_X86_64 == 0
- add t6d, t6d
- mov r6m, t6d
-%else
- mov r5d, r5m
- mov r7d, r7m
- add t6d, t6d
-%endif
-
- pxor m0, m0
- mov t8d, 8/2
-.loop:
- movh m1, [t0]
- movh m2, [t0 + t5]
- punpcklbw m1, m0
- punpcklbw m2, m0
- movu m3, [t1]
- movu m4, [t1 + t5 * 2]
- paddw m1, m3
- paddw m2, m4
- packuswb m1, m2
-
- ; store recon[] and recipred[]
- movlps [t2], m1
- movhps [t2 + t5], m1
- movlps [t4], m1
-%if ARCH_X86_64 == 0
- add t4, t7
- movhps [t4], m1
- add t4, t7
-%else
- movhps [t4 + t7], m1
- lea t4, [t4 + t7 * 2]
-%endif
-
- ; store recqt[]
- punpcklbw m2, m1, m0
- punpckhbw m1, m0
- movu [t3], m2
- add t3, t6
- movu [t3], m1
- add t3, t6
-
- lea t0, [t0 + t5 * 2]
- lea t1, [t1 + t5 * 4]
- lea t2, [t2 + t5 * 2]
-
- dec t8d
- jnz .loop
- RET
-
-
-INIT_XMM sse4
-cglobal calcRecons16
-%if ARCH_X86_64 == 1
- DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
- PROLOGUE 6,9,5
-%else
- DECLARE_REG_TMP 0,1,2,3,4,5
- PROLOGUE 6,7,5
- %define t6 r6m
- %define t6d r6d
- %define t7 r7m
- %define t8d r6d
-%endif
-
- mov t6d, r6m
-%if ARCH_X86_64 == 0
- add t6d, t6d
- mov r6m, t6d
-%else
- mov r5d, r5m
- mov r7d, r7m
- add t6d, t6d
-%endif
-
- pxor m0, m0
- mov t8d, 16
-.loop:
- movu m2, [t0]
- pmovzxbw m1, m2
- punpckhbw m2, m0
- movu m3, [t1]
- movu m4, [t1 + 16]
- paddw m1, m3
- paddw m2, m4
- packuswb m1, m2
-
- ; store recon[] and recipred[]
- movu [t2], m1
- movu [t4], m1
-
- ; store recqt[]
- pmovzxbw m2, m1
- punpckhbw m1, m0
- movu [t3], m2
- movu [t3 + 16], m1
-
- add t3, t6
- add t4, t7
- add t0, t5
- lea t1, [t1 + t5 * 2]
- add t2, t5
-
- dec t8d
- jnz .loop
- RET
-
-
-INIT_XMM sse4
-cglobal calcRecons32
-%if ARCH_X86_64 == 1
- DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
- PROLOGUE 6,9,7
-%else
- DECLARE_REG_TMP 0,1,2,3,4,5
- PROLOGUE 6,7,7
- %define t6 r6m
- %define t6d r6d
- %define t7 r7m
- %define t8d r6d
-%endif
-
- mov t6d, r6m
-%if ARCH_X86_64 == 0
- add t6d, t6d
- mov r6m, t6d
-%else
- mov r5d, r5m
- mov r7d, r7m
- add t6d, t6d
-%endif
-
- pxor m0, m0
- mov t8d, 32
-.loop:
- movu m2, [t0]
- movu m4, [t0 + 16]
- pmovzxbw m1, m2
- punpckhbw m2, m0
- pmovzxbw m3, m4
- punpckhbw m4, m0
-
- movu m5, [t1 + 0 * 16]
- movu m6, [t1 + 1 * 16]
- paddw m1, m5
- paddw m2, m6
- packuswb m1, m2
-
- movu m5, [t1 + 2 * 16]
- movu m6, [t1 + 3 * 16]
- paddw m3, m5
- paddw m4, m6
- packuswb m3, m4
-
- ; store recon[] and recipred[]
- movu [t2], m1
- movu [t2 + 16], m3
- movu [t4], m1
- movu [t4 + 16], m3
-
- ; store recqt[]
- pmovzxbw m2, m1
- punpckhbw m1, m0
- movu [t3 + 0 * 16], m2
- movu [t3 + 1 * 16], m1
- pmovzxbw m4, m3
- punpckhbw m3, m0
- movu [t3 + 2 * 16], m4
- movu [t3 + 3 * 16], m3
-
- add t3, t6
- add t4, t7
- add t0, t5
- lea t1, [t1 + t5 * 2]
- add t2, t5
-
- dec t8d
- jnz .loop
- RET
-
-
-INIT_XMM sse4
-cglobal calcRecons64
-%if ARCH_X86_64 == 1
- DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
- PROLOGUE 6,9,7
-%else
- DECLARE_REG_TMP 0,1,2,3,4,5
- PROLOGUE 6,7,7
- %define t6 r6m
- %define t6d r6d
- %define t7 r7m
- %define t8d r6d
-%endif
-
- mov t6d, r6m
-%if ARCH_X86_64 == 0
- add t6d, t6d
- mov r6m, t6d
-%else
- mov r5d, r5m
- mov r7d, r7m
- add t6d, t6d
-%endif
-
- pxor m0, m0
- mov t8d, 64
-.loop:
- ; left 32 pixel
- movu m2, [t0 + 0 * 16]
- movu m4, [t0 + 1 * 16]
- pmovzxbw m1, m2
- punpckhbw m2, m0
- pmovzxbw m3, m4
- punpckhbw m4, m0
-
- movu m5, [t1 + 0 * 16]
- movu m6, [t1 + 1 * 16]
- paddw m1, m5
- paddw m2, m6
- packuswb m1, m2
-
- movu m5, [t1 + 2 * 16]
- movu m6, [t1 + 3 * 16]
- paddw m3, m5
- paddw m4, m6
- packuswb m3, m4
-
- ; store recon[] and recipred[]
- movu [t2 + 0 * 16], m1
- movu [t2 + 1 * 16], m3
- movu [t4 + 0 * 16], m1
- movu [t4 + 1 * 16], m3
-
- ; store recqt[]
- pmovzxbw m2, m1
- punpckhbw m1, m0
- movu [t3 + 0 * 16], m2
- movu [t3 + 1 * 16], m1
- pmovzxbw m4, m3
- punpckhbw m3, m0
- movu [t3 + 2 * 16], m4
- movu [t3 + 3 * 16], m3
-
- ; right 32 pixel
- movu m2, [t0 + 2 * 16]
- movu m4, [t0 + 3 * 16]
- pmovzxbw m1, m2
- punpckhbw m2, m0
- pmovzxbw m3, m4
- punpckhbw m4, m0
-
- movu m5, [t1 + 4 * 16]
- movu m6, [t1 + 5 * 16]
- paddw m1, m5
- paddw m2, m6
- packuswb m1, m2
-
- movu m5, [t1 + 6 * 16]
- movu m6, [t1 + 7 * 16]
- paddw m3, m5
- paddw m4, m6
- packuswb m3, m4
-
- ; store recon[] and recipred[]
- movu [t2 + 2 * 16], m1
- movu [t2 + 3 * 16], m3
- movu [t4 + 2 * 16], m1
- movu [t4 + 3 * 16], m3
-
- ; store recqt[]
- pmovzxbw m2, m1
- punpckhbw m1, m0
- movu [t3 + 4 * 16], m2
- movu [t3 + 5 * 16], m1
- pmovzxbw m4, m3
- punpckhbw m3, m0
- movu [t3 + 6 * 16], m4
- movu [t3 + 7 * 16], m3
-
- add t3, t6
- add t4, t7
- add t0, t5
- lea t1, [t1 + t5 * 2]
- add t2, t5
-
- dec t8d
- jnz .loop
- RET
+;*****************************************************************************
+;* Copyright (C) 2013 x265 project
+;*
+;* Authors: Min Chen <chenm003 at 163.com> <min.chen at multicorewareinc.com>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+;*
+;* This program is also available under a commercial proprietary license.
+;* For more information, contact us at licensing at multicorewareinc.com.
+;*****************************************************************************/
+
+%include "x86inc.asm"
+%include "x86util.asm"
+
+SECTION_RODATA 32
+
+SECTION .text
+
+
+;-----------------------------------------------------------------------------
+; void cvt32to16_shr(short *dst, int *src, intptr_t stride, int shift, int size)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal cvt32to16_shr, 5, 7, 3, dst, src, stride
+%define rnd m2
+%define shift m1
+; dst = saturate16((src + rnd) >> shift); only m0-m2 are used so no callee-saved XMM register (xmm6+ on Win64) is clobbered
+ ; make shift
+ mov r5d, r3m
+ movd shift, r5d ; shift count lives in the low bits of an XMM register for psrad
+
+ ; make round
+ dec r5 ; r5 = shift - 1 (assumes shift >= 1 - TODO confirm with callers)
+ xor r6, r6
+ bts r6, r5 ; r6 = 1 << (shift - 1)
+
+ movd rnd, r6d
+ pshufd rnd, rnd, 0 ; broadcast the rounding constant to all four dwords
+
+ ; register alloc
+ ; r0 - dst
+ ; r1 - src
+ ; r2 - stride * 2 (short*)
+ ; r3 - lx
+ ; r4 - size
+ ; r5 - ly
+ ; r6 - diff
+ lea r2, [r2 * 2] ; dst row pitch in bytes
+
+ mov r4d, r4m
+ mov r5, r4
+ mov r6, r2
+ sub r6, r4
+ lea r6, [r6 * 2] ; r6 = 2 * (stride * 2 - size): bytes from the end of the written columns to the row two below
+
+ shr r5, 1 ; ly = size / 2, two rows are handled per outer iteration
+.loop_row:
+
+ mov r3, r4
+ shr r3, 2 ; lx = size / 4, four coefficients are handled per inner iteration
+.loop_col:
+ ; row 0
+ movu m0, [r1] ; four 32-bit inputs
+ paddd m0, rnd
+ psrad m0, shift ; arithmetic shift right
+ packssdw m0, m0 ; narrow to 16 bits with signed saturation
+ movh [r0], m0 ; store four 16-bit results
+
+ ; row 1
+ movu m0, [r1 + r4 * 4] ; next src row: src rows are size ints apart
+ paddd m0, rnd
+ psrad m0, shift
+ packssdw m0, m0
+ movh [r0 + r2], m0
+
+ ; move col pointer
+ add r1, 16
+ add r0, 8
+
+ dec r3
+ jg .loop_col
+
+ ; update pointer
+ lea r1, [r1 + r4 * 4] ; skip the src row already consumed as "row 1"
+ add r0, r6
+
+ ; end of loop_row
+ dec r5
+ jg .loop_row
+
+ RET
+
+
+;-----------------------------------------------------------------------------
+; void calcrecon(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal calcRecons4
+%if ARCH_X86_64 == 1
+ DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
+ PROLOGUE 6,9,4
+%else
+ DECLARE_REG_TMP 0,1,2,3,4,5
+ PROLOGUE 6,7,4
+ %define t6 r6m
+ %define t6d r6d
+ %define t7 r7m
+ %define t8d r6d
+%endif
+; 4x4 block: t0 = pred, t1 = residual, t2 = recon, t3 = reconqt, t4 = reconipred, t5 = stride, t6 = strideqt, t7 = strideipred, t8d = loop counter
+ mov t6d, r6m
+%if ARCH_X86_64 == 0
+ add t6d, t6d ; strideqt in bytes (int16_t elements)
+ mov r6m, t6d ; write back so r6d can be reused as the loop counter (t8d)
+%else
+ mov r5d, r5m ; 32-bit move clears the upper half of r5 (stride is an int)
+ mov r7d, r7m ; strideipred is not loaded by PROLOGUE (only 6 arguments are)
+ add t6d, t6d ; strideqt in bytes (int16_t elements)
+%endif
+
+ pxor m0, m0 ; m0 = 0, used to widen bytes to words
+ mov t8d, 4/2 ; two rows per iteration
+.loop:
+ movd m1, [t0] ; 4 bytes of pred, row 0
+ movd m2, [t0 + t5] ; 4 bytes of pred, row 1
+ punpckldq m1, m2 ; m1 = row 0 | row 1
+ punpcklbw m1, m0 ; widen the 8 bytes to 8 words
+ movh m2, [t1] ; 4 residual words, row 0
+ movh m3, [t1 + t5 * 2] ; 4 residual words, row 1 (residual row pitch is stride * 2 bytes)
+ punpcklqdq m2, m3
+ paddw m1, m2 ; pred + residual
+ packuswb m1, m1 ; pack back to bytes with unsigned saturation
+
+ ; store recon[] and recipred[]
+ movd [t2], m1 ; row 0
+ movd [t4], m1
+ add t4, t7
+ pshufd m2, m1, 1 ; move row 1 (dword 1) into dword 0
+ movd [t2 + t5], m2 ; row 1
+ movd [t4], m2
+ add t4, t7
+
+ ; store recqt[]
+ punpcklbw m1, m0 ; widen the clamped pixels to 16 bits again
+ movlps [t3], m1 ; row 0
+ add t3, t6
+ movhps [t3], m1 ; row 1
+ add t3, t6
+
+ lea t0, [t0 + t5 * 2] ; pred += 2 rows
+ lea t1, [t1 + t5 * 4] ; residual += 2 rows (2 bytes per element)
+ lea t2, [t2 + t5 * 2] ; recon += 2 rows
+
+ dec t8d
+ jnz .loop
+ RET
+
+
+INIT_XMM sse2
+cglobal calcRecons8
+%if ARCH_X86_64 == 1
+ DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
+ PROLOGUE 6,9,5
+%else
+ DECLARE_REG_TMP 0,1,2,3,4,5
+ PROLOGUE 6,7,5
+ %define t6 r6m
+ %define t6d r6d
+ %define t7 r7m
+ %define t8d r6d
+%endif
+; 8x8 block: t0 = pred, t1 = residual, t2 = recon, t3 = reconqt, t4 = reconipred, t5 = stride, t6 = strideqt, t7 = strideipred, t8d = loop counter
+ mov t6d, r6m
+%if ARCH_X86_64 == 0
+ add t6d, t6d ; strideqt in bytes (int16_t elements)
+ mov r6m, t6d ; write back so r6d can be reused as the loop counter (t8d)
+%else
+ mov r5d, r5m ; 32-bit move clears the upper half of r5 (stride is an int)
+ mov r7d, r7m ; strideipred is not loaded by PROLOGUE (only 6 arguments are)
+ add t6d, t6d ; strideqt in bytes (int16_t elements)
+%endif
+
+ pxor m0, m0 ; m0 = 0, used to widen bytes to words
+ mov t8d, 8/2 ; two rows per iteration
+.loop:
+ movh m1, [t0] ; 8 bytes of pred, row 0
+ movh m2, [t0 + t5] ; 8 bytes of pred, row 1
+ punpcklbw m1, m0 ; widen to words
+ punpcklbw m2, m0
+ movu m3, [t1] ; 8 residual words, row 0 (unaligned load)
+ movu m4, [t1 + t5 * 2] ; 8 residual words, row 1
+ paddw m1, m3 ; pred + residual
+ paddw m2, m4
+ packuswb m1, m2 ; low half = row 0, high half = row 1, unsigned saturation
+
+ ; store recon[] and recipred[]
+ movlps [t2], m1
+ movhps [t2 + t5], m1
+ movlps [t4], m1
+%if ARCH_X86_64 == 0
+ add t4, t7
+ movhps [t4], m1
+ add t4, t7
+%else
+ movhps [t4 + t7], m1
+ lea t4, [t4 + t7 * 2]
+%endif
+
+ ; store recqt[]
+ punpcklbw m2, m1, m0 ; row 0 widened to words
+ punpckhbw m1, m0 ; row 1 widened to words
+ movu [t3], m2
+ add t3, t6
+ movu [t3], m1
+ add t3, t6
+
+ lea t0, [t0 + t5 * 2] ; pred += 2 rows
+ lea t1, [t1 + t5 * 4] ; residual += 2 rows (2 bytes per element)
+ lea t2, [t2 + t5 * 2] ; recon += 2 rows
+
+ dec t8d
+ jnz .loop
+ RET
+
+
+INIT_XMM sse4
+cglobal calcRecons16
+%if ARCH_X86_64 == 1
+ DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
+ PROLOGUE 6,9,3
+%else
+ DECLARE_REG_TMP 0,1,2,3,4,5
+ PROLOGUE 6,7,3
+ %define t6 r6m
+ %define t6d r6d
+ %define t7 r7m
+ %define t8d r6d
+%endif
+; 16x16 block: t0 = pred, t1 = residual, t2 = recon, t3 = reconqt, t4 = reconipred, t5 = stride, t6 = strideqt, t7 = strideipred, t8d = loop counter
+ mov t6d, r6m
+%if ARCH_X86_64 == 0
+ add t6d, t6d ; strideqt in bytes (int16_t elements)
+ mov r6m, t6d ; write back so r6d can be reused as the loop counter (t8d)
+%else
+ mov r5d, r5m ; 32-bit move clears the upper half of r5 (stride is an int)
+ mov r7d, r7m ; strideipred is not loaded by PROLOGUE (only 6 arguments are)
+ add t6d, t6d ; strideqt in bytes (int16_t elements)
+%endif
+
+ pxor m0, m0 ; m0 = 0, used to widen bytes to words
+ mov t8d, 16 ; one row per iteration
+.loop:
+ movu m2, [t0] ; 16 bytes of pred
+ pmovzxbw m1, m2 ; low 8 bytes widened to words
+ punpckhbw m2, m0 ; high 8 bytes widened to words
+ paddw m1, [t1] ; memory operand: residual row must be 16-byte aligned
+ paddw m2, [t1 + 16]
+ packuswb m1, m2 ; pack back to bytes with unsigned saturation
+
+ ; store recon[] and recipred[]
+ movu [t2], m1
+ movu [t4], m1
+
+ ; store recqt[]
+ pmovzxbw m2, m1 ; widen the clamped pixels to 16 bits again
+ punpckhbw m1, m0
+ movu [t3], m2
+ movu [t3 + 16], m1
+
+ add t3, t6
+ add t4, t7
+ add t0, t5
+ lea t1, [t1 + t5 * 2] ; residual += 1 row (2 bytes per element)
+ add t2, t5
+
+ dec t8d
+ jnz .loop
+ RET
+
+
+INIT_XMM sse4
+cglobal calcRecons32
+%if ARCH_X86_64 == 1
+ DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
+ PROLOGUE 6,9,5
+%else
+ DECLARE_REG_TMP 0,1,2,3,4,5
+ PROLOGUE 6,7,5
+ %define t6 r6m
+ %define t6d r6d
+ %define t7 r7m
+ %define t8d r6d
+%endif
+; 32x32 block: t0 = pred, t1 = residual, t2 = recon, t3 = reconqt, t4 = reconipred, t5 = stride, t6 = strideqt, t7 = strideipred, t8d = loop counter
+ mov t6d, r6m
+%if ARCH_X86_64 == 0
+ add t6d, t6d ; strideqt in bytes (int16_t elements)
+ mov r6m, t6d ; write back so r6d can be reused as the loop counter (t8d)
+%else
+ mov r5d, r5m ; 32-bit move clears the upper half of r5 (stride is an int)
+ mov r7d, r7m ; strideipred is not loaded by PROLOGUE (only 6 arguments are)
+ add t6d, t6d ; strideqt in bytes (int16_t elements)
+%endif
+
+ pxor m0, m0 ; m0 = 0, used to widen bytes to words
+ mov t8d, 32 ; one row per iteration
+.loop:
+ movu m2, [t0] ; pred bytes 0..15
+ movu m4, [t0 + 16] ; pred bytes 16..31
+ pmovzxbw m1, m2 ; widen each 16-byte group into a low/high pair of word vectors
+ punpckhbw m2, m0
+ pmovzxbw m3, m4
+ punpckhbw m4, m0
+
+ paddw m1, [t1 + 0 * 16] ; memory operands: residual row must be 16-byte aligned
+ paddw m2, [t1 + 1 * 16]
+ packuswb m1, m2 ; pixels 0..15, unsigned saturation
+
+ paddw m3, [t1 + 2 * 16]
+ paddw m4, [t1 + 3 * 16]
+ packuswb m3, m4 ; pixels 16..31, unsigned saturation
+
+ ; store recon[] and recipred[]
+ movu [t2], m1
+ movu [t2 + 16], m3
+ movu [t4], m1
+ movu [t4 + 16], m3
+
+ ; store recqt[]
+ pmovzxbw m2, m1 ; widen the clamped pixels to 16 bits again
+ punpckhbw m1, m0
+ movu [t3 + 0 * 16], m2
+ movu [t3 + 1 * 16], m1
+ pmovzxbw m4, m3
+ punpckhbw m3, m0
+ movu [t3 + 2 * 16], m4
+ movu [t3 + 3 * 16], m3
+
+ add t3, t6
+ add t4, t7
+ add t0, t5
+ lea t1, [t1 + t5 * 2] ; residual += 1 row (2 bytes per element)
+ add t2, t5
+
+ dec t8d
+ jnz .loop
+ RET
+
+
+INIT_XMM sse4
+cglobal calcRecons64
+%if ARCH_X86_64 == 1
+ DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
+ PROLOGUE 6,9,7
+%else
+ DECLARE_REG_TMP 0,1,2,3,4,5
+ PROLOGUE 6,7,7
+ %define t6 r6m
+ %define t6d r6d
+ %define t7 r7m
+ %define t8d r6d
+%endif
+; 64x64 block: t0 = pred, t1 = residual, t2 = recon, t3 = reconqt, t4 = reconipred, t5 = stride, t6 = strideqt, t7 = strideipred, t8d = loop counter
+ mov t6d, r6m
+%if ARCH_X86_64 == 0
+ add t6d, t6d ; strideqt in bytes (int16_t elements)
+ mov r6m, t6d ; write back so r6d can be reused as the loop counter (t8d)
+%else
+ mov r5d, r5m ; 32-bit move clears the upper half of r5 (stride is an int)
+ mov r7d, r7m ; strideipred is not loaded by PROLOGUE (only 6 arguments are)
+ add t6d, t6d ; strideqt in bytes (int16_t elements)
+%endif
+
+ pxor m0, m0 ; m0 = 0, used to widen bytes to words
+ mov t8d, 64 ; one row per iteration
+.loop:
+ ; left 32 pixel
+ movu m2, [t0 + 0 * 16]
+ movu m4, [t0 + 1 * 16]
+ pmovzxbw m1, m2 ; widen each 16-byte group into a low/high pair of word vectors
+ punpckhbw m2, m0
+ pmovzxbw m3, m4
+ punpckhbw m4, m0
+
+ movu m5, [t1 + 0 * 16] ; residual is loaded with unaligned loads in this function
+ movu m6, [t1 + 1 * 16]
+ paddw m1, m5 ; pred + residual
+ paddw m2, m6
+ packuswb m1, m2 ; pixels 0..15, unsigned saturation
+
+ movu m5, [t1 + 2 * 16]
+ movu m6, [t1 + 3 * 16]
+ paddw m3, m5
+ paddw m4, m6
+ packuswb m3, m4 ; pixels 16..31, unsigned saturation
+
+ ; store recon[] and recipred[]
+ movu [t2 + 0 * 16], m1
+ movu [t2 + 1 * 16], m3
+ movu [t4 + 0 * 16], m1
+ movu [t4 + 1 * 16], m3
+
+ ; store recqt[]
+ pmovzxbw m2, m1 ; widen the clamped pixels to 16 bits again
+ punpckhbw m1, m0
+ movu [t3 + 0 * 16], m2
+ movu [t3 + 1 * 16], m1
+ pmovzxbw m4, m3
+ punpckhbw m3, m0
+ movu [t3 + 2 * 16], m4
+ movu [t3 + 3 * 16], m3
+
+ ; right 32 pixel
+ movu m2, [t0 + 2 * 16]
+ movu m4, [t0 + 3 * 16]
+ pmovzxbw m1, m2
+ punpckhbw m2, m0
+ pmovzxbw m3, m4
+ punpckhbw m4, m0
+
+ movu m5, [t1 + 4 * 16]
+ movu m6, [t1 + 5 * 16]
+ paddw m1, m5
+ paddw m2, m6
+ packuswb m1, m2 ; pixels 32..47, unsigned saturation
+
+ movu m5, [t1 + 6 * 16]
+ movu m6, [t1 + 7 * 16]
+ paddw m3, m5
+ paddw m4, m6
+ packuswb m3, m4 ; pixels 48..63, unsigned saturation
+
+ ; store recon[] and recipred[]
+ movu [t2 + 2 * 16], m1
+ movu [t2 + 3 * 16], m3
+ movu [t4 + 2 * 16], m1
+ movu [t4 + 3 * 16], m3
+
+ ; store recqt[]
+ pmovzxbw m2, m1
+ punpckhbw m1, m0
+ movu [t3 + 4 * 16], m2
+ movu [t3 + 5 * 16], m1
+ pmovzxbw m4, m3
+ punpckhbw m3, m0
+ movu [t3 + 6 * 16], m4
+ movu [t3 + 7 * 16], m3
+
+ add t3, t6
+ add t4, t7
+ add t0, t5
+ lea t1, [t1 + t5 * 2] ; residual += 1 row (2 bytes per element)
+ add t2, t5
+
+ dec t8d
+ jnz .loop
+ RET
More information about the x265-devel
mailing list