[x265] [PATCH] asm: improve on intra_dc32

dave dtyx265 at gmail.com
Sat Mar 7 00:58:13 CET 2015


On 03/06/2015 05:07 PM, Min Chen wrote:
> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1425690429 28800
> # Node ID 63d132c844b9d299081b40e7589275b78fe71093
> # Parent  043c2418864b0a3ada6f597e6def6ead73d90b5f
> asm: improve on intra_dc32
> ---
>   source/common/x86/intrapred8.asm |   71 ++++++++++++--------------------------
>   1 files changed, 22 insertions(+), 49 deletions(-)
>
> diff -r 043c2418864b -r 63d132c844b9 source/common/x86/intrapred8.asm
> --- a/source/common/x86/intrapred8.asm	Fri Mar 06 13:15:55 2015 -0600
> +++ b/source/common/x86/intrapred8.asm	Fri Mar 06 17:07:09 2015 -0800
> @@ -524,15 +524,21 @@
>       pshuflw         m1,            m1, 0x00       ; m1 = byte [dc_val ...]
>       pshufd          m1,            m1, 0x00
>   
> +    lea             r2,            [r0 + r1 * 2]
>   %assign x 0
> -%rep 16
> +%rep 8
>       ; store DC 16x16
>       movu            [r0],               m1
> +    movu            [r0 + 16],          m1
>       movu            [r0 + r1],          m1
> -    movu            [r0 + 16],          m1
>       movu            [r0 + r1 + 16],     m1
> -%if x < 16
> -    lea             r0,            [r0 + 2 * r1]
> +    movu            [r2],               m1
> +    movu            [r2 + 16],          m1
> +    movu            [r2 + r1],          m1
> +    movu            [r2 + r1 + 16],     m1
> +%if x < 8
> +    lea             r0,            [r0 + 4 * r1]
> +    lea             r2,            [r2 + 4 * r1]
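All this does is trade 15 "lea r0, ..." instructions for 7 "lea r0, ..." plus 8 "lea r2, ..." instructions, so the total number of lea instructions stays the same.

With the patch applied: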

./test/TestBench --testbench intrapred | grep intra_dc_32x32
intra_dc_32x32[f=0]    4.45x      1680.01       7475.13

And with the original code:

./test/TestBench --testbench intrapred | grep intra_dc_32x32
intra_dc_32x32[f=0]    4.53x      1650.03       7475.56
>   %endif
>   %assign x x+1
>   %endrep
> @@ -996,14 +1002,13 @@
>   ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
>   ;---------------------------------------------------------------------------------------------
>   INIT_XMM sse4
> -cglobal intra_pred_dc32, 3, 5, 5
> -    lea             r3, [r2 + 65]
> +cglobal intra_pred_dc32, 3,3,5
>       inc             r2
>       pxor            m0,            m0
>       movu            m1,            [r2]
>       movu            m2,            [r2 + 16]
> -    movu            m3,            [r3]
> -    movu            m4,            [r3 + 16]
> +    movu            m3,            [r2 + 64]
> +    movu            m4,            [r2 + 64 + 16]
>       psadbw          m1,            m0
>       psadbw          m2,            m0
>       psadbw          m3,            m0
> @@ -1014,54 +1019,22 @@
>       pshufd          m2,            m1, 2
>       paddw           m1,            m2
>   
> -    movd            r4d,           m1
> -    add             r4d,           32
> -    shr             r4d,           6     ; sum = sum / 64
> -    movd            m1,            r4d
> -    pshufb          m1,            m0    ; m1 = byte [dc_val ...]
> -
> -%rep 2
> +    paddw           m1,            [pw_32]      ; sum = (sum + 32) / 64
> +    psrlw           m1,            6
> +    pshufb          m1,            m0           ; m1 = byte [dc_val ...]
> +
> +    ; store DC 16x16
> +%assign x 0
> +%rep 16
>       ; store DC 16x16
>       movu            [r0],          m1
> +    movu            [r0 + 16],     m1
>       movu            [r0 + r1],     m1
> -    movu            [r0 + 16],     m1
>       movu            [r0 + r1 + 16],m1
> +  %if (x < 16)
>       lea             r0,            [r0 + 2 * r1]
No "lea r2, ..." here, unlike the earlier hunk?
> -    movu            [r0],          m1
> -    movu            [r0 + r1],     m1
> -    movu            [r0 + 16],     m1
> -    movu            [r0 + r1 + 16],m1
> -    lea             r0,            [r0 + 2 * r1]
> -    movu            [r0],          m1
> -    movu            [r0 + r1],     m1
> -    movu            [r0 + 16],     m1
> -    movu            [r0 + r1 + 16],m1
> -    lea             r0,            [r0 + 2 * r1]
> -    movu            [r0],          m1
> -    movu            [r0 + r1],     m1
> -    movu            [r0 + 16],     m1
> -    movu            [r0 + r1 + 16],m1
> -    lea             r0,            [r0 + 2 * r1]
> -    movu            [r0],          m1
> -    movu            [r0 + r1],     m1
> -    movu            [r0 + 16],     m1
> -    movu            [r0 + r1 + 16],m1
> -    lea             r0,            [r0 + 2 * r1]
> -    movu            [r0],          m1
> -    movu            [r0 + r1],     m1
> -    movu            [r0 + 16],     m1
> -    movu            [r0 + r1 + 16],m1
> -    lea             r0,            [r0 + 2 * r1]
> -    movu            [r0],          m1
> -    movu            [r0 + r1],     m1
> -    movu            [r0 + 16],     m1
> -    movu            [r0 + r1 + 16],m1
> -    lea             r0,            [r0 + 2 * r1]
> -    movu            [r0],          m1
> -    movu            [r0 + r1],     m1
> -    movu            [r0 + 16],     m1
> -    movu            [r0 + r1 + 16],m1
> -    lea             r0,            [r0 + 2 * r1]
> +  %endif
> +%assign x x+1
>   %endrep
>   
>       RET
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel



More information about the x265-devel mailing list