[x265] [PATCH 1 of 3] asm: residual buffer is aligned to its size, so we can use aligned load instructions

Steve Borho steve at borho.org
Sat Nov 16 06:42:16 CET 2013


On Thu, Nov 14, 2013 at 11:18 PM, Min Chen <chenm003 at 163.com> wrote:

> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1384492636 -28800
> # Node ID 9592525e376f4b41d4fde000ae77814a00b06822
> # Parent  ee42f57411ae746095dd36e36064145ed869d73c
> asm: residual buffer is aligned to its size, so we can use aligned load instructions
>


This patch is malformed; can you send it as a patch file?
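
For context on the change under review: legacy-SSE instructions such as
paddw require a 16-byte aligned address when given a memory operand, so
folding the residual loads directly into paddw in calcRecons16/32 is only
safe if every 16-byte chunk of the residual buffer the kernels read starts
on a 16-byte boundary. The two asserts added in TEncSearch.cpp encode that
invariant. Below is a minimal C++ sketch of the reasoning, with
illustrative names only (it is not code from the patch):

    #include <cassert>
    #include <cstdint>

    // If `residual` is aligned to `width` bytes (width in pixels, a power
    // of two) and width >= 16, then the chunks read as [t1], [t1 + 16], ...
    // in the 16- and 32-wide kernels are all 16-byte aligned, which is
    // what a legacy-SSE memory operand requires.
    static inline void checkResidualAlignment(const int16_t* residual, int width)
    {
        uintptr_t addr = reinterpret_cast<uintptr_t>(residual);
        assert((addr & uintptr_t(width - 1)) == 0); // aligned to `width` bytes
        assert(width <= 32);                        // largest alignment expected
        if (width >= 16)
            assert(addr % 16 == 0); // safe for "paddw m1, [t1]"-style operands
    }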


> diff -r ee42f57411ae -r 9592525e376f source/Lib/TLibEncoder/TEncSearch.cpp
> --- a/source/Lib/TLibEncoder/TEncSearch.cpp     Thu Nov 14 13:38:07 2013 -0600
> +++ b/source/Lib/TLibEncoder/TEncSearch.cpp     Fri Nov 15 13:17:16 2013 +0800
> @@ -502,6 +502,8 @@
>      }
>
>      //===== reconstruction =====
> +    assert(((uint32_t)residual & (width - 1)) == 0);
> +    assert(width <= 32);
>      primitives.calcrecon[size](pred, residual, recon, reconQt, reconIPred, stride, MAX_CU_SIZE, reconIPredStride);
>
>      //===== update distortion =====
> @@ -636,6 +638,8 @@
>      }
>
>      //===== reconstruction =====
> +    assert(((uint32_t)residual & (width - 1)) == 0);
> +    assert(width <= 32);
>      primitives.calcrecon[size](pred, residual, recon, reconQt, reconIPred, stride, MAX_CU_SIZE / 2, reconIPredStride);
>
>      //===== update distortion =====
> diff -r ee42f57411ae -r 9592525e376f source/common/x86/pixel-util.asm
> --- a/source/common/x86/pixel-util.asm  Thu Nov 14 13:38:07 2013 -0600
> +++ b/source/common/x86/pixel-util.asm  Fri Nov 15 13:17:16 2013 +0800
> @@ -1,475 +1,469 @@
> -;*****************************************************************************
> -;* Copyright (C) 2013 x265 project
> -;*
> -;* Authors: Min Chen <chenm003 at 163.com> <min.chen at multicorewareinc.com>
> -;*
> -;* This program is free software; you can redistribute it and/or modify
> -;* it under the terms of the GNU General Public License as published by
> -;* the Free Software Foundation; either version 2 of the License, or
> -;* (at your option) any later version.
> -;*
> -;* This program is distributed in the hope that it will be useful,
> -;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> -;* GNU General Public License for more details.
> -;*
> -;* You should have received a copy of the GNU General Public License
> -;* along with this program; if not, write to the Free Software
> -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
> -;*
> -;* This program is also available under a commercial proprietary license.
> -;* For more information, contact us at licensing at multicorewareinc.com.
> -;*****************************************************************************/
> -
> -%include "x86inc.asm"
> -%include "x86util.asm"
> -
> -SECTION_RODATA 32
> -
> -SECTION .text
> -
> -
> -;-----------------------------------------------------------------------------
> -; void cvt32to16_shr(short *dst, int *src, intptr_t stride, int shift, int size)
> -;-----------------------------------------------------------------------------
> -INIT_XMM sse2
> -cglobal cvt32to16_shr, 5, 7, 1, dst, src, stride
> -%define rnd     m7
> -%define shift   m6
> -
> -    ; make shift
> -    mov         r5d, r3m
> -    movd        shift, r5d
> -
> -    ; make round
> -    dec         r5
> -    xor         r6, r6
> -    bts         r6, r5
> -
> -    movd        rnd, r6d
> -    pshufd      rnd, rnd, 0
> -
> -    ; register alloc
> -    ; r0 - dst
> -    ; r1 - src
> -    ; r2 - stride * 2 (short*)
> -    ; r3 - lx
> -    ; r4 - size
> -    ; r5 - ly
> -    ; r6 - diff
> -    lea         r2, [r2 * 2]
> -
> -    mov         r4d, r4m
> -    mov         r5, r4
> -    mov         r6, r2
> -    sub         r6, r4
> -    lea         r6, [r6 * 2]
> -
> -    shr         r5, 1
> -.loop_row:
> -
> -    mov         r3, r4
> -    shr         r3, 2
> -.loop_col:
> -    ; row 0
> -    movu        m0, [r1]
> -    paddd       m0, rnd
> -    psrad       m0, shift
> -    packssdw    m0, m0
> -    movh        [r0], m0
> -
> -    ; row 1
> -    movu        m0, [r1 + r4 * 4]
> -    paddd       m0, rnd
> -    psrad       m0, shift
> -    packssdw    m0, m0
> -    movh        [r0 + r2], m0
> -
> -    ; move col pointer
> -    add         r1, 16
> -    add         r0, 8
> -
> -    dec         r3
> -    jg          .loop_col
> -
> -    ; update pointer
> -    lea         r1, [r1 + r4 * 4]
> -    add         r0, r6
> -
> -    ; end of loop_row
> -    dec         r5
> -    jg         .loop_row
> -
> -    RET
> -
> -
> -;-----------------------------------------------------------------------------
> -; void calcrecon(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred)
> -;-----------------------------------------------------------------------------
> -INIT_XMM sse2
> -cglobal calcRecons4
> -%if ARCH_X86_64 == 1
> -    DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
> -    PROLOGUE 6,9,4
> -%else
> -    DECLARE_REG_TMP 0,1,2,3,4,5
> -    PROLOGUE 6,7,4
> -    %define t6      r6m
> -    %define t6d     r6d
> -    %define t7      r7m
> -    %define t8d     r6d
> -%endif
> -
> -    mov         t6d, r6m
> -%if ARCH_X86_64 == 0
> -    add         t6d, t6d
> -    mov         r6m, t6d
> -%else
> -    mov         r5d, r5m
> -    mov         r7d, r7m
> -    add         t6d, t6d
> -%endif
> -
> -    pxor        m0, m0
> -    mov         t8d, 4/2
> -.loop:
> -    movd        m1, [t0]
> -    movd        m2, [t0 + t5]
> -    punpckldq   m1, m2
> -    punpcklbw   m1, m0
> -    movh        m2, [t1]
> -    movh        m3, [t1 + t5 * 2]
> -    punpcklqdq  m2, m3
> -    paddw       m1, m2
> -    packuswb    m1, m1
> -
> -    ; store recon[] and recipred[]
> -    movd        [t2], m1
> -    movd        [t4], m1
> -    add         t4, t7
> -    pshufd      m2, m1, 1
> -    movd        [t2 + t5], m2
> -    movd        [t4], m2
> -    add         t4, t7
> -
> -    ; store recqt[]
> -    punpcklbw   m1, m0
> -    movlps      [t3], m1
> -    add         t3, t6
> -    movhps      [t3], m1
> -    add         t3, t6
> -
> -    lea         t0, [t0 + t5 * 2]
> -    lea         t1, [t1 + t5 * 4]
> -    lea         t2, [t2 + t5 * 2]
> -
> -    dec         t8d
> -    jnz        .loop
> -    RET
> -
> -
> -INIT_XMM sse2
> -cglobal calcRecons8
> -%if ARCH_X86_64 == 1
> -    DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
> -    PROLOGUE 6,9,5
> -%else
> -    DECLARE_REG_TMP 0,1,2,3,4,5
> -    PROLOGUE 6,7,5
> -    %define t6      r6m
> -    %define t6d     r6d
> -    %define t7      r7m
> -    %define t8d     r6d
> -%endif
> -
> -    mov         t6d, r6m
> -%if ARCH_X86_64 == 0
> -    add         t6d, t6d
> -    mov         r6m, t6d
> -%else
> -    mov         r5d, r5m
> -    mov         r7d, r7m
> -    add         t6d, t6d
> -%endif
> -
> -    pxor        m0, m0
> -    mov         t8d, 8/2
> -.loop:
> -    movh        m1, [t0]
> -    movh        m2, [t0 + t5]
> -    punpcklbw   m1, m0
> -    punpcklbw   m2, m0
> -    movu        m3, [t1]
> -    movu        m4, [t1 + t5 * 2]
> -    paddw       m1, m3
> -    paddw       m2, m4
> -    packuswb    m1, m2
> -
> -    ; store recon[] and recipred[]
> -    movlps      [t2], m1
> -    movhps      [t2 + t5], m1
> -    movlps      [t4], m1
> -%if ARCH_X86_64 == 0
> -    add         t4, t7
> -    movhps      [t4], m1
> -    add         t4, t7
> -%else
> -    movhps      [t4 + t7], m1
> -    lea         t4, [t4 + t7 * 2]
> -%endif
> -
> -    ; store recqt[]
> -    punpcklbw   m2, m1, m0
> -    punpckhbw   m1, m0
> -    movu        [t3], m2
> -    add         t3, t6
> -    movu        [t3], m1
> -    add         t3, t6
> -
> -    lea         t0, [t0 + t5 * 2]
> -    lea         t1, [t1 + t5 * 4]
> -    lea         t2, [t2 + t5 * 2]
> -
> -    dec         t8d
> -    jnz        .loop
> -    RET
> -
> -
> -INIT_XMM sse4
> -cglobal calcRecons16
> -%if ARCH_X86_64 == 1
> -    DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
> -    PROLOGUE 6,9,5
> -%else
> -    DECLARE_REG_TMP 0,1,2,3,4,5
> -    PROLOGUE 6,7,5
> -    %define t6      r6m
> -    %define t6d     r6d
> -    %define t7      r7m
> -    %define t8d     r6d
> -%endif
> -
> -    mov         t6d, r6m
> -%if ARCH_X86_64 == 0
> -    add         t6d, t6d
> -    mov         r6m, t6d
> -%else
> -    mov         r5d, r5m
> -    mov         r7d, r7m
> -    add         t6d, t6d
> -%endif
> -
> -    pxor        m0, m0
> -    mov         t8d, 16
> -.loop:
> -    movu        m2, [t0]
> -    pmovzxbw    m1, m2
> -    punpckhbw   m2, m0
> -    movu        m3, [t1]
> -    movu        m4, [t1 + 16]
> -    paddw       m1, m3
> -    paddw       m2, m4
> -    packuswb    m1, m2
> -
> -    ; store recon[] and recipred[]
> -    movu        [t2], m1
> -    movu        [t4], m1
> -
> -    ; store recqt[]
> -    pmovzxbw    m2, m1
> -    punpckhbw   m1, m0
> -    movu        [t3], m2
> -    movu        [t3 + 16], m1
> -
> -    add         t3, t6
> -    add         t4, t7
> -    add         t0, t5
> -    lea         t1, [t1 + t5 * 2]
> -    add         t2, t5
> -
> -    dec         t8d
> -    jnz        .loop
> -    RET
> -
> -
> -INIT_XMM sse4
> -cglobal calcRecons32
> -%if ARCH_X86_64 == 1
> -    DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
> -    PROLOGUE 6,9,7
> -%else
> -    DECLARE_REG_TMP 0,1,2,3,4,5
> -    PROLOGUE 6,7,7
> -    %define t6      r6m
> -    %define t6d     r6d
> -    %define t7      r7m
> -    %define t8d     r6d
> -%endif
> -
> -    mov         t6d, r6m
> -%if ARCH_X86_64 == 0
> -    add         t6d, t6d
> -    mov         r6m, t6d
> -%else
> -    mov         r5d, r5m
> -    mov         r7d, r7m
> -    add         t6d, t6d
> -%endif
> -
> -    pxor        m0, m0
> -    mov         t8d, 32
> -.loop:
> -    movu        m2, [t0]
> -    movu        m4, [t0 + 16]
> -    pmovzxbw    m1, m2
> -    punpckhbw   m2, m0
> -    pmovzxbw    m3, m4
> -    punpckhbw   m4, m0
> -
> -    movu        m5, [t1 + 0 * 16]
> -    movu        m6, [t1 + 1 * 16]
> -    paddw       m1, m5
> -    paddw       m2, m6
> -    packuswb    m1, m2
> -
> -    movu        m5, [t1 + 2 * 16]
> -    movu        m6, [t1 + 3 * 16]
> -    paddw       m3, m5
> -    paddw       m4, m6
> -    packuswb    m3, m4
> -
> -    ; store recon[] and recipred[]
> -    movu        [t2], m1
> -    movu        [t2 + 16], m3
> -    movu        [t4], m1
> -    movu        [t4 + 16], m3
> -
> -    ; store recqt[]
> -    pmovzxbw    m2, m1
> -    punpckhbw   m1, m0
> -    movu        [t3 + 0 * 16], m2
> -    movu        [t3 + 1 * 16], m1
> -    pmovzxbw    m4, m3
> -    punpckhbw   m3, m0
> -    movu        [t3 + 2 * 16], m4
> -    movu        [t3 + 3 * 16], m3
> -
> -    add         t3, t6
> -    add         t4, t7
> -    add         t0, t5
> -    lea         t1, [t1 + t5 * 2]
> -    add         t2, t5
> -
> -    dec         t8d
> -    jnz        .loop
> -    RET
> -
> -
> -INIT_XMM sse4
> -cglobal calcRecons64
> -%if ARCH_X86_64 == 1
> -    DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
> -    PROLOGUE 6,9,7
> -%else
> -    DECLARE_REG_TMP 0,1,2,3,4,5
> -    PROLOGUE 6,7,7
> -    %define t6      r6m
> -    %define t6d     r6d
> -    %define t7      r7m
> -    %define t8d     r6d
> -%endif
> -
> -    mov         t6d, r6m
> -%if ARCH_X86_64 == 0
> -    add         t6d, t6d
> -    mov         r6m, t6d
> -%else
> -    mov         r5d, r5m
> -    mov         r7d, r7m
> -    add         t6d, t6d
> -%endif
> -
> -    pxor        m0, m0
> -    mov         t8d, 64
> -.loop:
> -    ; left 32 pixel
> -    movu        m2, [t0 + 0 * 16]
> -    movu        m4, [t0 + 1 * 16]
> -    pmovzxbw    m1, m2
> -    punpckhbw   m2, m0
> -    pmovzxbw    m3, m4
> -    punpckhbw   m4, m0
> -
> -    movu        m5, [t1 + 0 * 16]
> -    movu        m6, [t1 + 1 * 16]
> -    paddw       m1, m5
> -    paddw       m2, m6
> -    packuswb    m1, m2
> -
> -    movu        m5, [t1 + 2 * 16]
> -    movu        m6, [t1 + 3 * 16]
> -    paddw       m3, m5
> -    paddw       m4, m6
> -    packuswb    m3, m4
> -
> -    ; store recon[] and recipred[]
> -    movu        [t2 + 0 * 16], m1
> -    movu        [t2 + 1 * 16], m3
> -    movu        [t4 + 0 * 16], m1
> -    movu        [t4 + 1 * 16], m3
> -
> -    ; store recqt[]
> -    pmovzxbw    m2, m1
> -    punpckhbw   m1, m0
> -    movu        [t3 + 0 * 16], m2
> -    movu        [t3 + 1 * 16], m1
> -    pmovzxbw    m4, m3
> -    punpckhbw   m3, m0
> -    movu        [t3 + 2 * 16], m4
> -    movu        [t3 + 3 * 16], m3
> -
> -    ; right 32 pixel
> -    movu        m2, [t0 + 2 * 16]
> -    movu        m4, [t0 + 3 * 16]
> -    pmovzxbw    m1, m2
> -    punpckhbw   m2, m0
> -    pmovzxbw    m3, m4
> -    punpckhbw   m4, m0
> -
> -    movu        m5, [t1 + 4 * 16]
> -    movu        m6, [t1 + 5 * 16]
> -    paddw       m1, m5
> -    paddw       m2, m6
> -    packuswb    m1, m2
> -
> -    movu        m5, [t1 + 6 * 16]
> -    movu        m6, [t1 + 7 * 16]
> -    paddw       m3, m5
> -    paddw       m4, m6
> -    packuswb    m3, m4
> -
> -    ; store recon[] and recipred[]
> -    movu        [t2 + 2 * 16], m1
> -    movu        [t2 + 3 * 16], m3
> -    movu        [t4 + 2 * 16], m1
> -    movu        [t4 + 3 * 16], m3
> -
> -    ; store recqt[]
> -    pmovzxbw    m2, m1
> -    punpckhbw   m1, m0
> -    movu        [t3 + 4 * 16], m2
> -    movu        [t3 + 5 * 16], m1
> -    pmovzxbw    m4, m3
> -    punpckhbw   m3, m0
> -    movu        [t3 + 6 * 16], m4
> -    movu        [t3 + 7 * 16], m3
> -
> -    add         t3, t6
> -    add         t4, t7
> -    add         t0, t5
> -    lea         t1, [t1 + t5 * 2]
> -    add         t2, t5
> -
> -    dec         t8d
> -    jnz        .loop
> -    RET
> +;*****************************************************************************
> +;* Copyright (C) 2013 x265 project
> +;*
> +;* Authors: Min Chen <chenm003 at 163.com> <min.chen at multicorewareinc.com>
> +;*
> +;* This program is free software; you can redistribute it and/or modify
> +;* it under the terms of the GNU General Public License as published by
> +;* the Free Software Foundation; either version 2 of the License, or
> +;* (at your option) any later version.
> +;*
> +;* This program is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +;* GNU General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU General Public License
> +;* along with this program; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
> +;*
> +;* This program is also available under a commercial proprietary license.
> +;* For more information, contact us at licensing at multicorewareinc.com.
> +;*****************************************************************************/
> +
> +%include "x86inc.asm"
> +%include "x86util.asm"
> +
> +SECTION_RODATA 32
> +
> +SECTION .text
> +
> +
> +;-----------------------------------------------------------------------------
> +; void cvt32to16_shr(short *dst, int *src, intptr_t stride, int shift, int size)
> +;-----------------------------------------------------------------------------
> +INIT_XMM sse2
> +cglobal cvt32to16_shr, 5, 7, 1, dst, src, stride
> +%define rnd     m7
> +%define shift   m6
> +
> +    ; make shift
> +    mov         r5d, r3m
> +    movd        shift, r5d
> +
> +    ; make round
> +    dec         r5
> +    xor         r6, r6
> +    bts         r6, r5
> +
> +    movd        rnd, r6d
> +    pshufd      rnd, rnd, 0
> +
> +    ; register alloc
> +    ; r0 - dst
> +    ; r1 - src
> +    ; r2 - stride * 2 (short*)
> +    ; r3 - lx
> +    ; r4 - size
> +    ; r5 - ly
> +    ; r6 - diff
> +    lea         r2, [r2 * 2]
> +
> +    mov         r4d, r4m
> +    mov         r5, r4
> +    mov         r6, r2
> +    sub         r6, r4
> +    lea         r6, [r6 * 2]
> +
> +    shr         r5, 1
> +.loop_row:
> +
> +    mov         r3, r4
> +    shr         r3, 2
> +.loop_col:
> +    ; row 0
> +    movu        m0, [r1]
> +    paddd       m0, rnd
> +    psrad       m0, shift
> +    packssdw    m0, m0
> +    movh        [r0], m0
> +
> +    ; row 1
> +    movu        m0, [r1 + r4 * 4]
> +    paddd       m0, rnd
> +    psrad       m0, shift
> +    packssdw    m0, m0
> +    movh        [r0 + r2], m0
> +
> +    ; move col pointer
> +    add         r1, 16
> +    add         r0, 8
> +
> +    dec         r3
> +    jg          .loop_col
> +
> +    ; update pointer
> +    lea         r1, [r1 + r4 * 4]
> +    add         r0, r6
> +
> +    ; end of loop_row
> +    dec         r5
> +    jg         .loop_row
> +
> +    RET
> +
> +
> +;-----------------------------------------------------------------------------
> +; void calcrecon(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred)
> +;-----------------------------------------------------------------------------
> +INIT_XMM sse2
> +cglobal calcRecons4
> +%if ARCH_X86_64 == 1
> +    DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
> +    PROLOGUE 6,9,4
> +%else
> +    DECLARE_REG_TMP 0,1,2,3,4,5
> +    PROLOGUE 6,7,4
> +    %define t6      r6m
> +    %define t6d     r6d
> +    %define t7      r7m
> +    %define t8d     r6d
> +%endif
> +
> +    mov         t6d, r6m
> +%if ARCH_X86_64 == 0
> +    add         t6d, t6d
> +    mov         r6m, t6d
> +%else
> +    mov         r5d, r5m
> +    mov         r7d, r7m
> +    add         t6d, t6d
> +%endif
> +
> +    pxor        m0, m0
> +    mov         t8d, 4/2
> +.loop:
> +    movd        m1, [t0]
> +    movd        m2, [t0 + t5]
> +    punpckldq   m1, m2
> +    punpcklbw   m1, m0
> +    movh        m2, [t1]
> +    movh        m3, [t1 + t5 * 2]
> +    punpcklqdq  m2, m3
> +    paddw       m1, m2
> +    packuswb    m1, m1
> +
> +    ; store recon[] and recipred[]
> +    movd        [t2], m1
> +    movd        [t4], m1
> +    add         t4, t7
> +    pshufd      m2, m1, 1
> +    movd        [t2 + t5], m2
> +    movd        [t4], m2
> +    add         t4, t7
> +
> +    ; store recqt[]
> +    punpcklbw   m1, m0
> +    movlps      [t3], m1
> +    add         t3, t6
> +    movhps      [t3], m1
> +    add         t3, t6
> +
> +    lea         t0, [t0 + t5 * 2]
> +    lea         t1, [t1 + t5 * 4]
> +    lea         t2, [t2 + t5 * 2]
> +
> +    dec         t8d
> +    jnz        .loop
> +    RET
> +
> +
> +INIT_XMM sse2
> +cglobal calcRecons8
> +%if ARCH_X86_64 == 1
> +    DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
> +    PROLOGUE 6,9,5
> +%else
> +    DECLARE_REG_TMP 0,1,2,3,4,5
> +    PROLOGUE 6,7,5
> +    %define t6      r6m
> +    %define t6d     r6d
> +    %define t7      r7m
> +    %define t8d     r6d
> +%endif
> +
> +    mov         t6d, r6m
> +%if ARCH_X86_64 == 0
> +    add         t6d, t6d
> +    mov         r6m, t6d
> +%else
> +    mov         r5d, r5m
> +    mov         r7d, r7m
> +    add         t6d, t6d
> +%endif
> +
> +    pxor        m0, m0
> +    mov         t8d, 8/2
> +.loop:
> +    movh        m1, [t0]
> +    movh        m2, [t0 + t5]
> +    punpcklbw   m1, m0
> +    punpcklbw   m2, m0
> +    movu        m3, [t1]
> +    movu        m4, [t1 + t5 * 2]
> +    paddw       m1, m3
> +    paddw       m2, m4
> +    packuswb    m1, m2
> +
> +    ; store recon[] and recipred[]
> +    movlps      [t2], m1
> +    movhps      [t2 + t5], m1
> +    movlps      [t4], m1
> +%if ARCH_X86_64 == 0
> +    add         t4, t7
> +    movhps      [t4], m1
> +    add         t4, t7
> +%else
> +    movhps      [t4 + t7], m1
> +    lea         t4, [t4 + t7 * 2]
> +%endif
> +
> +    ; store recqt[]
> +    punpcklbw   m2, m1, m0
> +    punpckhbw   m1, m0
> +    movu        [t3], m2
> +    add         t3, t6
> +    movu        [t3], m1
> +    add         t3, t6
> +
> +    lea         t0, [t0 + t5 * 2]
> +    lea         t1, [t1 + t5 * 4]
> +    lea         t2, [t2 + t5 * 2]
> +
> +    dec         t8d
> +    jnz        .loop
> +    RET
> +
> +
> +INIT_XMM sse4
> +cglobal calcRecons16
> +%if ARCH_X86_64 == 1
> +    DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
> +    PROLOGUE 6,9,3
> +%else
> +    DECLARE_REG_TMP 0,1,2,3,4,5
> +    PROLOGUE 6,7,3
> +    %define t6      r6m
> +    %define t6d     r6d
> +    %define t7      r7m
> +    %define t8d     r6d
> +%endif
> +
> +    mov         t6d, r6m
> +%if ARCH_X86_64 == 0
> +    add         t6d, t6d
> +    mov         r6m, t6d
> +%else
> +    mov         r5d, r5m
> +    mov         r7d, r7m
> +    add         t6d, t6d
> +%endif
> +
> +    pxor        m0, m0
> +    mov         t8d, 16
> +.loop:
> +    movu        m2, [t0]
> +    pmovzxbw    m1, m2
> +    punpckhbw   m2, m0
> +    paddw       m1, [t1]
> +    paddw       m2, [t1 + 16]
> +    packuswb    m1, m2
> +
> +    ; store recon[] and recipred[]
> +    movu        [t2], m1
> +    movu        [t4], m1
> +
> +    ; store recqt[]
> +    pmovzxbw    m2, m1
> +    punpckhbw   m1, m0
> +    movu        [t3], m2
> +    movu        [t3 + 16], m1
> +
> +    add         t3, t6
> +    add         t4, t7
> +    add         t0, t5
> +    lea         t1, [t1 + t5 * 2]
> +    add         t2, t5
> +
> +    dec         t8d
> +    jnz        .loop
> +    RET
> +
> +
> +INIT_XMM sse4
> +cglobal calcRecons32
> +%if ARCH_X86_64 == 1
> +    DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
> +    PROLOGUE 6,9,5
> +%else
> +    DECLARE_REG_TMP 0,1,2,3,4,5
> +    PROLOGUE 6,7,5
> +    %define t6      r6m
> +    %define t6d     r6d
> +    %define t7      r7m
> +    %define t8d     r6d
> +%endif
> +
> +    mov         t6d, r6m
> +%if ARCH_X86_64 == 0
> +    add         t6d, t6d
> +    mov         r6m, t6d
> +%else
> +    mov         r5d, r5m
> +    mov         r7d, r7m
> +    add         t6d, t6d
> +%endif
> +
> +    pxor        m0, m0
> +    mov         t8d, 32
> +.loop:
> +    movu        m2, [t0]
> +    movu        m4, [t0 + 16]
> +    pmovzxbw    m1, m2
> +    punpckhbw   m2, m0
> +    pmovzxbw    m3, m4
> +    punpckhbw   m4, m0
> +
> +    paddw       m1, [t1 + 0 * 16]
> +    paddw       m2, [t1 + 1 * 16]
> +    packuswb    m1, m2
> +
> +    paddw       m3, [t1 + 2 * 16]
> +    paddw       m4, [t1 + 3 * 16]
> +    packuswb    m3, m4
> +
> +    ; store recon[] and recipred[]
> +    movu        [t2], m1
> +    movu        [t2 + 16], m3
> +    movu        [t4], m1
> +    movu        [t4 + 16], m3
> +
> +    ; store recqt[]
> +    pmovzxbw    m2, m1
> +    punpckhbw   m1, m0
> +    movu        [t3 + 0 * 16], m2
> +    movu        [t3 + 1 * 16], m1
> +    pmovzxbw    m4, m3
> +    punpckhbw   m3, m0
> +    movu        [t3 + 2 * 16], m4
> +    movu        [t3 + 3 * 16], m3
> +
> +    add         t3, t6
> +    add         t4, t7
> +    add         t0, t5
> +    lea         t1, [t1 + t5 * 2]
> +    add         t2, t5
> +
> +    dec         t8d
> +    jnz        .loop
> +    RET
> +
> +
> +INIT_XMM sse4
> +cglobal calcRecons64
> +%if ARCH_X86_64 == 1
> +    DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
> +    PROLOGUE 6,9,7
> +%else
> +    DECLARE_REG_TMP 0,1,2,3,4,5
> +    PROLOGUE 6,7,7
> +    %define t6      r6m
> +    %define t6d     r6d
> +    %define t7      r7m
> +    %define t8d     r6d
> +%endif
> +
> +    mov         t6d, r6m
> +%if ARCH_X86_64 == 0
> +    add         t6d, t6d
> +    mov         r6m, t6d
> +%else
> +    mov         r5d, r5m
> +    mov         r7d, r7m
> +    add         t6d, t6d
> +%endif
> +
> +    pxor        m0, m0
> +    mov         t8d, 64
> +.loop:
> +    ; left 32 pixel
> +    movu        m2, [t0 + 0 * 16]
> +    movu        m4, [t0 + 1 * 16]
> +    pmovzxbw    m1, m2
> +    punpckhbw   m2, m0
> +    pmovzxbw    m3, m4
> +    punpckhbw   m4, m0
> +
> +    movu        m5, [t1 + 0 * 16]
> +    movu        m6, [t1 + 1 * 16]
> +    paddw       m1, m5
> +    paddw       m2, m6
> +    packuswb    m1, m2
> +
> +    movu        m5, [t1 + 2 * 16]
> +    movu        m6, [t1 + 3 * 16]
> +    paddw       m3, m5
> +    paddw       m4, m6
> +    packuswb    m3, m4
> +
> +    ; store recon[] and recipred[]
> +    movu        [t2 + 0 * 16], m1
> +    movu        [t2 + 1 * 16], m3
> +    movu        [t4 + 0 * 16], m1
> +    movu        [t4 + 1 * 16], m3
> +
> +    ; store recqt[]
> +    pmovzxbw    m2, m1
> +    punpckhbw   m1, m0
> +    movu        [t3 + 0 * 16], m2
> +    movu        [t3 + 1 * 16], m1
> +    pmovzxbw    m4, m3
> +    punpckhbw   m3, m0
> +    movu        [t3 + 2 * 16], m4
> +    movu        [t3 + 3 * 16], m3
> +
> +    ; right 32 pixel
> +    movu        m2, [t0 + 2 * 16]
> +    movu        m4, [t0 + 3 * 16]
> +    pmovzxbw    m1, m2
> +    punpckhbw   m2, m0
> +    pmovzxbw    m3, m4
> +    punpckhbw   m4, m0
> +
> +    movu        m5, [t1 + 4 * 16]
> +    movu        m6, [t1 + 5 * 16]
> +    paddw       m1, m5
> +    paddw       m2, m6
> +    packuswb    m1, m2
> +
> +    movu        m5, [t1 + 6 * 16]
> +    movu        m6, [t1 + 7 * 16]
> +    paddw       m3, m5
> +    paddw       m4, m6
> +    packuswb    m3, m4
> +
> +    ; store recon[] and recipred[]
> +    movu        [t2 + 2 * 16], m1
> +    movu        [t2 + 3 * 16], m3
> +    movu        [t4 + 2 * 16], m1
> +    movu        [t4 + 3 * 16], m3
> +
> +    ; store recqt[]
> +    pmovzxbw    m2, m1
> +    punpckhbw   m1, m0
> +    movu        [t3 + 4 * 16], m2
> +    movu        [t3 + 5 * 16], m1
> +    pmovzxbw    m4, m3
> +    punpckhbw   m3, m0
> +    movu        [t3 + 6 * 16], m4
> +    movu        [t3 + 7 * 16], m3
> +
> +    add         t3, t6
> +    add         t4, t7
> +    add         t0, t5
> +    lea         t1, [t1 + t5 * 2]
> +    add         t2, t5
> +
> +    dec         t8d
> +    jnz        .loop
> +    RET
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>



-- 
Steve Borho