[x265] [PATCH] asm-ssse3: filter_p2s[12x16](9.87x), filter_p2s[24x32](10.30x), filter_p2s[48x64](9.60x)
chen
chenm003 at 163.com
Wed Mar 4 17:11:58 CET 2015
At 2015-03-04 14:39:13,rajesh at multicorewareinc.com wrote:
># HG changeset patch
># User Rajesh Paulraj <rajesh at multicorewareinc.com>
># Date 1425451088 -19800
># Wed Mar 04 12:08:08 2015 +0530
># Node ID 94991f753feae850b6edd371481e199f76243af3
># Parent 018e8bbaa854b1a4bd82b3a2e23f7775a77da5cc
>asm-ssse3: filter_p2s[12x16](9.87x), filter_p2s[24x32](10.30x),
> filter_p2s[48x64](9.60x)
>
@@ -5730,6 +5726,169 @@
> PIXEL_WH_64xN 64, 16
> PIXEL_WH_64xN 64, 32
> PIXEL_WH_64xN 64, 48
>+;-----------------------------------------------------------------------------
>+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
>+;-----------------------------------------------------------------------------
>+INIT_XMM ssse3
>+cglobal pixelToShort_12x16, 3, 4, 6
>+ ; 12x16 variant: r0 = src, r1 = srcStride, r2 = dst; each dst row is FENC_STRIDE * 2 bytes after the previous one
>+ ; load the two constant vectors reused by every row (m4 = interleave bytes, m5 = multiplier bytes)
>+ mova m4, [pb_128]
>+ mova m5, [tab_c_64_n64]
>+ ; NOTE(review): presumably pb_128 is all 128s and tab_c_64_n64 is +64/-64 pairs, giving pixel * 64 - 8192 -- confirm against their definitions
>+%rep 4
>+ ; one repetition converts 4 source rows, so the 4 repetitions cover all 16 rows
>+ movu m0, [r0] ; row 0: 16-byte load, of which 12 converted pixels are stored
>+ movu m1, m0 ; NOTE(review): looks redundant -- the three-operand punpcklbw below writes m1 without reading it (assuming x86inc operand emulation)
copy between register always aligned
>+ punpcklbw m1, m0, m4 ; m1 = bytes 0-7 of m0 interleaved with bytes of m4
you copy into m1 a moment ago, why overwrite before use it?
>+ punpckhbw m0, m4 ; m0 = bytes 8-15 of m0 interleaved with bytes of m4
>+ pmaddubsw m0, m5 ; multiply each byte pair by the matching m5 pair and sum into one 16-bit word
>+ pmaddubsw m1, m5
>+
>+ movu m2, [r0 + r1] ; row 1, same sequence as row 0
>+ movu m3, m2 ; NOTE(review): same apparently dead copy as for row 0
>+ punpcklbw m3, m2, m4
>+ punpckhbw m2, m4
>+ pmaddubsw m2, m5
>+ pmaddubsw m3, m5
>+
>+ movu [r2 + FENC_STRIDE * 0], m1 ; row 0, outputs 0-7 (16 bytes)
>+ movu [r2 + FENC_STRIDE * 2], m3 ; row 1, outputs 0-7
>+
>+ movh [r2 + FENC_STRIDE * 0 + 16], m0 ; row 0, outputs 8-11 (only the low 8 bytes of m0)
>+ movh [r2 + FENC_STRIDE * 2 + 16], m2 ; row 1, outputs 8-11
>+
>+ movu m0, [r0 + r1 * 2] ; row 2
>+ movu m1, m0 ; NOTE(review): same apparently dead copy as for row 0
>+ punpcklbw m1, m0, m4
>+ punpckhbw m0, m4
>+ pmaddubsw m0, m5
>+ pmaddubsw m1, m5
>+
>+ lea r3, [r0 + r1 * 2] ; r3 = address of row 2, so [r3 + r1] addresses row 3
>+ movu m2, [r3 + r1] ; row 3
>+ movu m3, m2 ; NOTE(review): same apparently dead copy as for row 0
>+ punpcklbw m3, m2, m4
>+ punpckhbw m2, m4
>+ pmaddubsw m2, m5
>+ pmaddubsw m3, m5
>+
>+ movu [r2 + FENC_STRIDE * 4], m1 ; row 2, outputs 0-7
>+ movu [r2 + FENC_STRIDE * 6], m3 ; row 3, outputs 0-7
>+
>+ movh [r2 + FENC_STRIDE * 4 + 16], m0 ; row 2, outputs 8-11
>+ movh [r2 + FENC_STRIDE * 6 + 16], m2 ; row 3, outputs 8-11
>+
>+ lea r0, [r0 + r1 * 4] ; advance src by 4 rows
>+ add r2, FENC_STRIDE * 8 ; advance dst by 4 rows
>+%endrep
>+ RET
>+
>+;-----------------------------------------------------------------------------
>+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
>+;
>+; Converts a 24x32 block of 8-bit pixels to 16-bit words, 8 pixels by 4 rows
>+; per inner iteration.
>+; In:      r0 = src, r1 = srcStride (bytes), r2 = dst
>+;          dst rows are FENC_STRIDE * 2 bytes apart
>+; Scratch: r3 = rows remaining, r4 = column offset in pixels, r5 = row pointer
>+;          m0-m3 = one row each, m4/m5 = constants
>+;-----------------------------------------------------------------------------
>+INIT_XMM ssse3
>+cglobal pixelToShort_24x32, 3, 6, 6
>+
>+ ; constants shared by every row
>+ ; NOTE(review): presumably pb_128 is all 128s and tab_c_64_n64 holds +64/-64
>+ ; byte pairs, so each output is pixel * 64 - 8192 -- confirm against the
>+ ; table definitions.
>+ mova m4, [pb_128]
>+ mova m5, [tab_c_64_n64]
>+
>+ mov r3d, 32 ; r3 = rows remaining
>+
>+.loopH:
>+ xor r4d, r4d ; restart at column 0 for this group of 4 rows
>+
>+.loopW:
>+ lea r5, [r0 + r4] ; r5 = row 0 at the current column
>+
>+ movh m0, [r5] ; 8 pixels of row 0
>+ punpcklbw m0, m4 ; interleave each pixel with a byte of m4
>+ pmaddubsw m0, m5 ; byte pairs * m5 pairs, summed into one word per pixel
>+
>+ movh m1, [r5 + r1] ; row 1
>+ punpcklbw m1, m4
>+ pmaddubsw m1, m5
>+
>+ movh m2, [r5 + r1 * 2] ; row 2
>+ punpcklbw m2, m4
>+ pmaddubsw m2, m5
>+
>+ lea r5, [r5 + r1 * 2] ; r5 = row 2, so [r5 + r1] addresses row 3
>+ movh m3, [r5 + r1] ; row 3
>+ punpcklbw m3, m4
>+ pmaddubsw m3, m5
>+
>+ ; r4 counts pixels and each output is 2 bytes, hence r4 * 2.
>+ ; Storing before the increment avoids a "- 16" correction on every address.
>+ movu [r2 + r4 * 2 + FENC_STRIDE * 0], m0
>+ movu [r2 + r4 * 2 + FENC_STRIDE * 2], m1
>+ movu [r2 + r4 * 2 + FENC_STRIDE * 4], m2
>+ movu [r2 + r4 * 2 + FENC_STRIDE * 6], m3
>+
>+ add r4, 8
>+ cmp r4, 24
>+ jne .loopW ; falls through once all 24 columns are done
>+
>+ lea r0, [r0 + r1 * 4] ; advance src by 4 rows
>+ add r2, FENC_STRIDE * 8 ; advance dst by 4 rows
>+
>+ sub r3d, 4
>+ jnz .loopH
>+ RET
>+
>+;-----------------------------------------------------------------------------
>+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
>+;
>+; Converts a 48x64 block of 8-bit pixels to 16-bit words, 8 pixels by 4 rows
>+; per inner iteration.
>+; In:      r0 = src, r1 = srcStride (bytes), r2 = dst
>+;          dst rows are FENC_STRIDE * 2 bytes apart
>+; Scratch: r3 = rows remaining, r4 = column offset in pixels, r5 = row pointer
>+;          m0-m3 = one row each, m4/m5 = constants
>+;-----------------------------------------------------------------------------
>+INIT_XMM ssse3
>+cglobal pixelToShort_48x64, 3, 6, 6
>+
>+ ; constants shared by every row
>+ ; NOTE(review): presumably pb_128 is all 128s and tab_c_64_n64 holds +64/-64
>+ ; byte pairs, so each output is pixel * 64 - 8192 -- confirm against the
>+ ; table definitions.
>+ mova m4, [pb_128]
>+ mova m5, [tab_c_64_n64]
>+
>+ mov r3d, 64 ; r3 = rows remaining
>+
>+.loopH:
>+ xor r4d, r4d ; restart at column 0 for this group of 4 rows
>+
>+.loopW:
>+ lea r5, [r0 + r4] ; r5 = row 0 at the current column
>+
>+ movh m0, [r5] ; 8 pixels of row 0
>+ punpcklbw m0, m4 ; interleave each pixel with a byte of m4
>+ pmaddubsw m0, m5 ; byte pairs * m5 pairs, summed into one word per pixel
>+
>+ movh m1, [r5 + r1] ; row 1
>+ punpcklbw m1, m4
>+ pmaddubsw m1, m5
>+
>+ movh m2, [r5 + r1 * 2] ; row 2
>+ punpcklbw m2, m4
>+ pmaddubsw m2, m5
>+
>+ lea r5, [r5 + r1 * 2] ; r5 = row 2, so [r5 + r1] addresses row 3
>+ movh m3, [r5 + r1] ; row 3
>+ punpcklbw m3, m4
>+ pmaddubsw m3, m5
>+
>+ ; r4 counts pixels and each output is 2 bytes, hence r4 * 2.
>+ ; Storing before the increment avoids a "- 16" correction on every address.
>+ movu [r2 + r4 * 2 + FENC_STRIDE * 0], m0
>+ movu [r2 + r4 * 2 + FENC_STRIDE * 2], m1
>+ movu [r2 + r4 * 2 + FENC_STRIDE * 4], m2
>+ movu [r2 + r4 * 2 + FENC_STRIDE * 6], m3
>+
>+ add r4, 8
>+ cmp r4, 48
>+ jne .loopW ; falls through once all 48 columns are done
>+
>+ lea r0, [r0 + r1 * 4] ; advance src by 4 rows
>+ add r2, FENC_STRIDE * 8 ; advance dst by 4 rows
>+
>+ sub r3d, 4
>+ jnz .loopH
>+ RET
>
> %macro PROCESS_LUMA_W4_4R 0
> movd m0, [r0]
>diff -r 018e8bbaa854 -r 94991f753fea source/common/x86/ipfilter8.h
>--- a/source/common/x86/ipfilter8.h Fri Feb 27 11:46:09 2015 +0530
>+++ b/source/common/x86/ipfilter8.h Wed Mar 04 12:08:08 2015 +0530
>@@ -642,6 +642,10 @@
> void x265_pixelToShort_64x32_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
> void x265_pixelToShort_64x48_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
> void x265_pixelToShort_64x64_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
>+void x265_pixelToShort_12x16_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
>+void x265_pixelToShort_24x32_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
>+void x265_pixelToShort_48x64_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
>+
> #undef LUMA_FILTERS
> #undef LUMA_SP_FILTERS
> #undef LUMA_SS_FILTERS
>diff -r 018e8bbaa854 -r 94991f753fea source/test/ipfilterharness.cpp
>--- a/source/test/ipfilterharness.cpp Fri Feb 27 11:46:09 2015 +0530
>+++ b/source/test/ipfilterharness.cpp Wed Mar 04 12:08:08 2015 +0530
>@@ -523,7 +523,7 @@
>
> checked(opt, pixel_test_buff[index] + i, rand_srcStride, IPF_vec_output_s);
>
>- if (memcmp(IPF_vec_output_s, IPF_C_output_s, TEST_BUF_SIZE * sizeof(pixel)))
>+ if (memcmp(IPF_vec_output_s, IPF_C_output_s, TEST_BUF_SIZE * sizeof(int16_t)))
> return false;
>
> reportfail();
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150305/f7bc85e4/attachment-0001.html>
More information about the x265-devel
mailing list