[x265] [PATCH] asm: improve on intra_dc32
dave
dtyx265 at gmail.com
Sat Mar 7 00:58:13 CET 2015
On 03/06/2015 05:07 PM, Min Chen wrote:
> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1425690429 28800
> # Node ID 63d132c844b9d299081b40e7589275b78fe71093
> # Parent 043c2418864b0a3ada6f597e6def6ead73d90b5f
> asm: improve on intra_dc32
> ---
> source/common/x86/intrapred8.asm | 71 ++++++++++++--------------------------
> 1 files changed, 22 insertions(+), 49 deletions(-)
>
> diff -r 043c2418864b -r 63d132c844b9 source/common/x86/intrapred8.asm
> --- a/source/common/x86/intrapred8.asm Fri Mar 06 13:15:55 2015 -0600
> +++ b/source/common/x86/intrapred8.asm Fri Mar 06 17:07:09 2015 -0800
> @@ -524,15 +524,21 @@
> pshuflw m1, m1, 0x00 ; m1 = byte [dc_val ...]
> pshufd m1, m1, 0x00
>
> + lea r2, [r0 + r1 * 2]
> %assign x 0
> -%rep 16
> +%rep 8
> ; store DC 16x16
> movu [r0], m1
> + movu [r0 + 16], m1
> movu [r0 + r1], m1
> - movu [r0 + 16], m1
> movu [r0 + r1 + 16], m1
> -%if x < 16
> - lea r0, [r0 + 2 * r1]
> + movu [r2], m1
> + movu [r2 + 16], m1
> + movu [r2 + r1], m1
> + movu [r2 + r1 + 16], m1
> +%if x < 8
> + lea r0, [r0 + 4 * r1]
> + lea r2, [r2 + 4 * r1]
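All this does is trade 15 "lea r0, ..." instructions for 7 "lea r0, ..." plus 8 "lea r2, ..." instructions, so the amount of address arithmetic is unchanged. Benchmark with the patch applied: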
./test/TestBench --testbench intrapred | grep intra_dc_32x32
intra_dc_32x32[f=0] 4.45x 1680.01 7475.13
And the same benchmark with the original code:
./test/TestBench --testbench intrapred | grep intra_dc_32x32
intra_dc_32x32[f=0] 4.53x 1650.03 7475.56
> %endif
> %assign x x+1
> %endrep
> @@ -996,14 +1002,13 @@
> ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
> ;---------------------------------------------------------------------------------------------
> INIT_XMM sse4
> -cglobal intra_pred_dc32, 3, 5, 5
> - lea r3, [r2 + 65]
> +cglobal intra_pred_dc32, 3,3,5
> inc r2
> pxor m0, m0
> movu m1, [r2]
> movu m2, [r2 + 16]
> - movu m3, [r3]
> - movu m4, [r3 + 16]
> + movu m3, [r2 + 64]
> + movu m4, [r2 + 64 + 16]
> psadbw m1, m0
> psadbw m2, m0
> psadbw m3, m0
> @@ -1014,54 +1019,22 @@
> pshufd m2, m1, 2
> paddw m1, m2
>
> - movd r4d, m1
> - add r4d, 32
> - shr r4d, 6 ; sum = sum / 64
> - movd m1, r4d
> - pshufb m1, m0 ; m1 = byte [dc_val ...]
> -
> -%rep 2
> + paddw m1, [pw_32] ; sum = (sum + 32) / 64
> + psrlw m1, 6
> + pshufb m1, m0 ; m1 = byte [dc_val ...]
> +
> + ; store DC 16x16
> +%assign x 0
> +%rep 16
> ; store DC 16x16
> movu [r0], m1
> + movu [r0 + 16], m1
> movu [r0 + r1], m1
> - movu [r0 + 16], m1
> movu [r0 + r1 + 16],m1
> + %if (x < 16)
> lea r0, [r0 + 2 * r1]
No "lea r2, ..." here, unlike in the first hunk?
> - movu [r0], m1
> - movu [r0 + r1], m1
> - movu [r0 + 16], m1
> - movu [r0 + r1 + 16],m1
> - lea r0, [r0 + 2 * r1]
> - movu [r0], m1
> - movu [r0 + r1], m1
> - movu [r0 + 16], m1
> - movu [r0 + r1 + 16],m1
> - lea r0, [r0 + 2 * r1]
> - movu [r0], m1
> - movu [r0 + r1], m1
> - movu [r0 + 16], m1
> - movu [r0 + r1 + 16],m1
> - lea r0, [r0 + 2 * r1]
> - movu [r0], m1
> - movu [r0 + r1], m1
> - movu [r0 + 16], m1
> - movu [r0 + r1 + 16],m1
> - lea r0, [r0 + 2 * r1]
> - movu [r0], m1
> - movu [r0 + r1], m1
> - movu [r0 + 16], m1
> - movu [r0 + r1 + 16],m1
> - lea r0, [r0 + 2 * r1]
> - movu [r0], m1
> - movu [r0 + r1], m1
> - movu [r0 + 16], m1
> - movu [r0 + r1 + 16],m1
> - lea r0, [r0 + 2 * r1]
> - movu [r0], m1
> - movu [r0 + r1], m1
> - movu [r0 + 16], m1
> - movu [r0 + r1 + 16],m1
> - lea r0, [r0 + 2 * r1]
> + %endif
> +%assign x x+1
> %endrep
>
> RET
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
More information about the x265-devel
mailing list