[x265] [PATCH] asm-sse2: 16bpp code for filter_p2s[4x4](2.67x), filter_p2s[4x8](3.12x),
chen
chenm003 at 163.com
Thu Mar 5 22:10:15 CET 2015
At 2015-03-05 20:07:40,rajesh at multicorewareinc.com wrote:
># HG changeset patch
># User Rajesh Paulraj<rajesh at multicorewareinc.com>
># Date 1425557077 -19800
># Thu Mar 05 17:34:37 2015 +0530
># Node ID cbac0290953fe5841760b4dfd387956c193df27c
># Parent 38cb572c2927eee8039464ec462b874c0da20871
>asm-sse2: 16bpp code for filter_p2s[4x4](2.67x), filter_p2s[4x8](3.12x),
>filter_p2s[4x16](3.11x), filter_p2s[8x4](4.90x), filter_p2s[8x8](4.54x),
>filter_p2s[8x16](5.68x), filter_p2s[8x32](6.28x), filter_p2s[16x4](7.98x),
>filter_p2s[16x8](9.87x), filter_p2s[16x12](9.66x), filter_p2s[16x16](10.24x),
>filter_p2s[16x32](10.38x), filter_p2s[16x64](10.36x), filter_p2s[32x8](8.06x),
>filter_p2s[32x16](7.09x), filter_p2s[32x24](7.98x), filter_p2s[32x32](7.09x),
>filter_p2s[32x64](8.11x), filter_p2s[64x16](8.03x), filter_p2s[64x32](7.11x),
>filter_p2s[64x48](7.89x), filter_p2s[64x64](6.79x), filter_p2s[12x16](8.35x),
>filter_p2s[24x32](10.31x), filter_p2s[48x64](6.89x)
>
>diff -r 38cb572c2927 -r cbac0290953f source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp Thu Mar 05 17:19:57 2015 +0530
>+++ b/source/common/x86/asm-primitives.cpp Thu Mar 05 17:34:37 2015 +0530
>@@ -855,7 +855,32 @@
> PIXEL_AVG_W4(mmx2);
> LUMA_VAR(sse2);
>
>- p.luma_p2s = x265_luma_p2s_sse2;
>+ p.pu[LUMA_4x4].filter_p2s = x265_pixelToShort_4x4_sse2;
>+ p.pu[LUMA_4x8].filter_p2s = x265_pixelToShort_4x8_sse2;
>+ p.pu[LUMA_4x16].filter_p2s = x265_pixelToShort_4x16_sse2;
>+ p.pu[LUMA_8x4].filter_p2s = x265_pixelToShort_8x4_sse2;
>+ p.pu[LUMA_8x8].filter_p2s = x265_pixelToShort_8x8_sse2;
>+ p.pu[LUMA_8x16].filter_p2s = x265_pixelToShort_8x16_sse2;
>+ p.pu[LUMA_8x32].filter_p2s = x265_pixelToShort_8x32_sse2;
>+ p.pu[LUMA_16x4].filter_p2s = x265_pixelToShort_16x4_sse2;
>+ p.pu[LUMA_16x8].filter_p2s = x265_pixelToShort_16x8_sse2;
>+ p.pu[LUMA_16x12].filter_p2s = x265_pixelToShort_16x12_sse2;
>+ p.pu[LUMA_16x16].filter_p2s = x265_pixelToShort_16x16_sse2;
>+ p.pu[LUMA_16x32].filter_p2s = x265_pixelToShort_16x32_sse2;
>+ p.pu[LUMA_16x64].filter_p2s = x265_pixelToShort_16x64_sse2;
>+ p.pu[LUMA_32x8].filter_p2s = x265_pixelToShort_32x8_sse2;
>+ p.pu[LUMA_32x16].filter_p2s = x265_pixelToShort_32x16_sse2;
>+ p.pu[LUMA_32x24].filter_p2s = x265_pixelToShort_32x24_sse2;
>+ p.pu[LUMA_32x32].filter_p2s = x265_pixelToShort_32x32_sse2;
>+ p.pu[LUMA_32x64].filter_p2s = x265_pixelToShort_32x64_sse2;
>+ p.pu[LUMA_64x16].filter_p2s = x265_pixelToShort_64x16_sse2;
>+ p.pu[LUMA_64x32].filter_p2s = x265_pixelToShort_64x32_sse2;
>+ p.pu[LUMA_64x48].filter_p2s = x265_pixelToShort_64x48_sse2;
>+ p.pu[LUMA_64x64].filter_p2s = x265_pixelToShort_64x64_sse2;
>+ p.pu[LUMA_12x16].filter_p2s = x265_pixelToShort_12x16_sse2;
>+ p.pu[LUMA_24x32].filter_p2s = x265_pixelToShort_24x32_sse2;
>+ p.pu[LUMA_48x64].filter_p2s = x265_pixelToShort_48x64_sse2;
>+
> p.chroma[X265_CSP_I420].p2s = x265_chroma_p2s_sse2;
> p.chroma[X265_CSP_I422].p2s = x265_chroma_p2s_sse2;
>
>diff -r 38cb572c2927 -r cbac0290953f source/common/x86/ipfilter16.asm
>--- a/source/common/x86/ipfilter16.asm Thu Mar 05 17:19:57 2015 +0530
>+++ b/source/common/x86/ipfilter16.asm Thu Mar 05 17:34:37 2015 +0530
>@@ -3,6 +3,7 @@
> ;*
> ;* Authors: Nabajit Deka <nabajit at multicorewareinc.com>
> ;* Murugan Vairavel <murugan at multicorewareinc.com>
>+;* Rajesh Paulraj <rajesh at multicorewareinc.com>
> ;*
> ;* This program is free software; you can redistribute it and/or modify
> ;* it under the terms of the GNU General Public License as published by
>@@ -5525,65 +5526,472 @@
> FILTER_VER_LUMA_SS 64, 16
> FILTER_VER_LUMA_SS 16, 64
>
>-;--------------------------------------------------------------------------------------------------
>-; void filterConvertPelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
>-;--------------------------------------------------------------------------------------------------
>+;-----------------------------------------------------------------------------
>+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
>+;-----------------------------------------------------------------------------
>+%macro P2S_H_4xN 1
> INIT_XMM sse2
>-cglobal luma_p2s, 3, 7, 5
>-
>+cglobal pixelToShort_4x%1, 3, 5, 5
> add r1, r1
>
>- ; load width and height
>- mov r3d, r3m
>- mov r4d, r4m
>+ ; load height
>+ mov r3d, %1
Use %1/4 here, so that we can use 'dec r3d' (instead of 'sub r3d, 4') below.
>
> ; load constant
> mova m4, [tab_c_n8192]
>
> .loopH:
>
>- xor r5d, r5d
>-.loopW:
>- lea r6, [r0 + r5 * 2]
>-
>- movu m0, [r6]
>+ movu m0, [r0]
> psllw m0, 4
> paddw m0, m4
>
>- movu m1, [r6 + r1]
>+ movu m1, [r0 + r1]
> psllw m1, 4
> paddw m1, m4
>
>- movu m2, [r6 + r1 * 2]
>+ movu m2, [r0 + r1 * 2]
> psllw m2, 4
> paddw m2, m4
>
>- lea r6, [r6 + r1 * 2]
>- movu m3, [r6 + r1]
>+ lea r4, [r0 + r1 * 2]
>+ movu m3, [r4 + r1]
> psllw m3, 4
> paddw m3, m4
>
>- add r5, 8
>- cmp r5, r3
>- jg .width4
>- movu [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
>- movu [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
>- movu [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
>- movu [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
>+ movh [r2 + FENC_STRIDE * 0 ], m0
>+ movh [r2 + FENC_STRIDE * 2 ], m1
>+ movh [r2 + FENC_STRIDE * 4 ], m2
>+ movh [r2 + FENC_STRIDE * 6 ], m3
>+
>+ lea r0, [r0 + r1 * 4]
>+ add r2, FENC_STRIDE * 8
>+
>+ sub r3d, 4
>+ jnz .loopH
>+
>+ RET
>+%endmacro
>+P2S_H_4xN 4
>+P2S_H_4xN 8
>+P2S_H_4xN 16
>+
>+;-----------------------------------------------------------------------------
>+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
>+;-----------------------------------------------------------------------------
>+%macro P2S_H_8xN 1
>+INIT_XMM sse2
>+cglobal pixelToShort_8x%1, 3, 5, 5
>+
>+ add r1, r1
>+
>+ ; load height
>+ mov r3d, %1
>+
>+ ; load constant
>+ mova m4, [tab_c_n8192]
>+
>+.loopH:
>+
>+ movu m0, [r0]
>+ psllw m0, 4
>+ paddw m0, m4
>+
>+ movu m1, [r0 + r1]
>+ psllw m1, 4
>+ paddw m1, m4
>+
>+ movu m2, [r0 + r1 * 2]
>+ psllw m2, 4
>+ paddw m2, m4
>+
>+ lea r4, [r0 + r1 * 2]
>+ movu m3, [r4 + r1]
>+ psllw m3, 4
>+ paddw m3, m4
>+
>+ movu [r2 + FENC_STRIDE * 0], m0
>+ movu [r2 + FENC_STRIDE * 2], m1
>+ movu [r2 + FENC_STRIDE * 4], m2
>+ movu [r2 + FENC_STRIDE * 6], m3
>+
>+ lea r0, [r0 + r1 * 4]
>+ add r2, FENC_STRIDE * 8
>+
>+ sub r3d, 4
>+ jnz .loopH
>+
>+ RET
>+%endmacro
>+P2S_H_8xN 8
>+P2S_H_8xN 4
>+P2S_H_8xN 16
>+P2S_H_8xN 32
>+
>+;-----------------------------------------------------------------------------
>+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
>+;-----------------------------------------------------------------------------
>+%macro P2S_H_16xN 1
>+INIT_XMM sse2
>+cglobal pixelToShort_16x%1, 3, 6, 5
>+
>+ add r1, r1
>+
>+ ; load height
>+ mov r3d, %1
>+
>+ ; load constant
>+ mova m4, [tab_c_n8192]
>+
>+.loopH:
>+ xor r4d, r4d
>+.loopW:
You process a 4x8 area in every loop iteration, which gives poor cache performance; I suggest using 1xN or something similar to get better performance.
>+ lea r5, [r0 + r4 * 2]
>+
>+ movu m0, [r5]
>+ psllw m0, 4
>+ paddw m0, m4
>+
>+ movu m1, [r5 + r1]
>+ psllw m1, 4
>+ paddw m1, m4
>+
>+ movu m2, [r5 + r1 * 2]
>+ psllw m2, 4
>+ paddw m2, m4
>+
>+ lea r5, [r5 + r1 * 2]
>+ movu m3, [r5 + r1]
>+ psllw m3, 4
>+ paddw m3, m4
>+
>+ add r4, 8
>+ cmp r4, 16
>+
>+ movu [r2 + r4 * 2 + FENC_STRIDE * 0 - 16], m0
>+ movu [r2 + r4 * 2 + FENC_STRIDE * 2 - 16], m1
>+ movu [r2 + r4 * 2 + FENC_STRIDE * 4 - 16], m2
>+ movu [r2 + r4 * 2 + FENC_STRIDE * 6 - 16], m3
> je .nextH
>- jmp .loopW
>-
>-.width4:
>- movh [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
>- movh [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
>- movh [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
>- movh [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
>+ jnz .loopW
>
> .nextH:
> lea r0, [r0 + r1 * 4]
> add r2, FENC_STRIDE * 8
>
>- sub r4d, 4
>+ sub r3d, 4
> jnz .loopH
>
> RET
>+%endmacro
>+P2S_H_16xN 4
>+P2S_H_16xN 8
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150306/30f89f2a/attachment-0001.html>
More information about the x265-devel
mailing list