[x265] [PATCH] cvt16to32_cnt optimization
chen
chenm003 at 163.com
Wed Sep 3 17:41:35 CEST 2014
I have a question: the new function is effectively cvt16to16_cnt, so do we still need it?
Other comments are inline below.
At 2014-09-02 22:11:31,praveen at multicorewareinc.com wrote:
># HG changeset patch
># User Praveen Tiwari
># Date 1409571425 -19800
># Node ID 51b5a6d820da97a4178dc42d2ef98ffe1970511b
># Parent c09f34b0ab57b4ce2f5cf4aa59c25d20eb6cbd54
>cvt16to32_cnt optimization
>
>diff -r c09f34b0ab57 -r 51b5a6d820da source/common/dct.cpp
>--- a/source/common/dct.cpp Mon Aug 25 16:54:19 2014 +0530
>+++ b/source/common/dct.cpp Mon Sep 01 17:07:05 2014 +0530
>@@ -834,7 +834,7 @@
> {
> for (int j = 0; j < trSize; j++)
> {
>- coeff[k * trSize + j] = ((int16_t)residual[k * stride + j]);
>+ coeff[k * trSize + j] = residual[k * stride + j];
Could this use memcpy instead?
> numSig += (residual[k * stride + j] != 0);
> }
> }
>diff -r c09f34b0ab57 -r 51b5a6d820da source/common/x86/blockcopy8.asm
>--- a/source/common/x86/blockcopy8.asm Mon Aug 25 16:54:19 2014 +0530
>+++ b/source/common/x86/blockcopy8.asm Mon Sep 01 17:07:05 2014 +0530
>@@ -29,6 +29,10 @@
>
> tab_Vm: db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
>
>+cextern pb_4
>+cextern pb_1
>+cextern pb_16
>+cextern pb_64
> cextern pw_4
> cextern pb_8
> cextern pb_32
>@@ -3946,52 +3950,47 @@
>
>
> ;--------------------------------------------------------------------------------------
>-; uint32_t cvt16to32_cnt(int32_t *dst, int16_t *src, intptr_t stride);
>+; uint32_t cvt16to32_cnt(int16_t *dst, int16_t *src, intptr_t stride);
> ;--------------------------------------------------------------------------------------
> INIT_XMM sse4
> cglobal cvt16to32_cnt_4, 3,3,5
> add r2d, r2d
> pxor m4, m4
>
>- ; row 0 & 1
>- movh m0, [r1]
>- movhps m0, [r1 + r2]
>- mova m2, m0
>- pmovsxwd m1, m0
>- punpckhwd m0, m0
>- psrad m0, 16
>- movu [r0 + 0 * mmsize], m1
>- movu [r0 + 1 * mmsize], m0
>-
>- ; row 2 & 3
>- movh m0, [r1 + r2 * 2]
>- lea r2, [r2 * 3]
>- movhps m0, [r1 + r2]
>- packsswb m2, m0
>- pcmpeqb m2, m4
>- pmovsxwd m1, m0
>- punpckhwd m0, m0
>- psrad m0, 16
>- movu [r0 + 2 * mmsize], m1
>- movu [r0 + 3 * mmsize], m0
>-
>- ; get count
>- ; CHECK_ME: Intel documents said POPCNT is SSE4.2 instruction, but just implement after Nehalem
>-%if 1
>- pmovmskb eax, m2
>- not ax
>- popcnt ax, ax
>+ ; row 0 & 1
>+ movh m0, [r1]
>+ movh m1, [r1 + r2]
>+ movh [r0], m0
>+ movh [r0 + 8], m1
Why not use movh + movhps + mova here instead?
>+ mova m2, [r0]
With the modification above, we can eliminate one memory operation here (the reload from [r0]).
>+
>+ ; row 2 & 3
>+ movh m0, [r1 + r2 * 2]
>+ lea r2, [r2 * 3]
>+ movh m1, [r1 + r2]
>+ movh [r0 + 16], m0
>+ movh [r0 + 24], m1
>+
>+ mova m0, [r0 + 16]
>+ packsswb m2, m0
>+ pcmpeqb m2, m4
>+
>+ ; get count
>+ ; CHECK_ME: Intel documents said POPCNT is SSE4.2 instruction, but just implement after Nehalem
>+%if 0
>+ pmovmskb eax, m2
>+ not ax
>+ popcnt ax, ax
> %else
>- movhlps m3, m2
>- paddw m2, m3
>-
>- mova m3, [pw_4]
>- paddw m3, m2
>- psadbw m3, m4
>-
>- movd eax, m3
>-%endif
>- RET
>+ mova m0, [pb_1]
>+ paddb m2, m0
Suggestion: use "paddb m2, [pb_1]" directly instead of first loading pb_1 into m0.
>+ psadbw m2, m4
>+ pshufd m0, m2, 2
>+ paddw m2, m0
>+ movd eax, m2
>+ %endif
>+ RET
>
>
> INIT_YMM avx2
>@@ -4023,71 +4022,65 @@
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20140903/e1d476f3/attachment-0001.html>
More information about the x265-devel
mailing list