<div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><div>I have a question: the new function is effectively cvt16to16_cnt, so do we still need it?</div><div>Other comments are inline below.</div><pre><br>At 2014-09-02 22:11:31,praveen@multicorewareinc.com wrote:
># HG changeset patch
># User Praveen Tiwari
># Date 1409571425 -19800
># Node ID 51b5a6d820da97a4178dc42d2ef98ffe1970511b
># Parent c09f34b0ab57b4ce2f5cf4aa59c25d20eb6cbd54
>cvt16to32_cnt optimization
>
>diff -r c09f34b0ab57 -r 51b5a6d820da source/common/dct.cpp
>--- a/source/common/dct.cpp Mon Aug 25 16:54:19 2014 +0530
>+++ b/source/common/dct.cpp Mon Sep 01 17:07:05 2014 +0530
>@@ -834,7 +834,7 @@
> {
> for (int j = 0; j < trSize; j++)
> {
>- coeff[k * trSize + j] = ((int16_t)residual[k * stride + j]);
>+ coeff[k * trSize + j] = residual[k * stride + j];
</pre><pre>memcpy?</pre><pre> </pre><pre>> numSig += (residual[k * stride + j] != 0);
> }
> }
>diff -r c09f34b0ab57 -r 51b5a6d820da source/common/x86/blockcopy8.asm
>--- a/source/common/x86/blockcopy8.asm Mon Aug 25 16:54:19 2014 +0530
>+++ b/source/common/x86/blockcopy8.asm Mon Sep 01 17:07:05 2014 +0530
>@@ -29,6 +29,10 @@
>
> tab_Vm: db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
>
>+cextern pb_4
>+cextern pb_1
>+cextern pb_16
>+cextern pb_64
> cextern pw_4
> cextern pb_8
> cextern pb_32
>@@ -3946,52 +3950,47 @@
>
>
> ;--------------------------------------------------------------------------------------
>-; uint32_t cvt16to32_cnt(int32_t *dst, int16_t *src, intptr_t stride);
>+; uint32_t cvt16to32_cnt(int16_t *dst, int16_t *src, intptr_t stride);
> ;--------------------------------------------------------------------------------------
> INIT_XMM sse4
> cglobal cvt16to32_cnt_4, 3,3,5
> add r2d, r2d
> pxor m4, m4
>
>- ; row 0 & 1
>- movh m0, [r1]
>- movhps m0, [r1 + r2]
>- mova m2, m0
>- pmovsxwd m1, m0
>- punpckhwd m0, m0
>- psrad m0, 16
>- movu [r0 + 0 * mmsize], m1
>- movu [r0 + 1 * mmsize], m0
>-
>- ; row 2 & 3
>- movh m0, [r1 + r2 * 2]
>- lea r2, [r2 * 3]
>- movhps m0, [r1 + r2]
>- packsswb m2, m0
>- pcmpeqb m2, m4
>- pmovsxwd m1, m0
>- punpckhwd m0, m0
>- psrad m0, 16
>- movu [r0 + 2 * mmsize], m1
>- movu [r0 + 3 * mmsize], m0
>-
>- ; get count
>- ; CHECK_ME: Intel documents said POPCNT is SSE4.2 instruction, but just implement after Nehalem
>-%if 1
>- pmovmskb eax, m2
>- not ax
>- popcnt ax, ax
>+ ; row 0 & 1
>+ movh m0, [r1]
>+ movh m1, [r1 + r2]
>+ movh [r0], m0
>+ movh [r0 + 8], m1
</pre><pre>movh+movhps+mova?</pre><pre>>+ mova m2, [r0]
</pre><pre>we can eliminate one memory operation after the above modification</pre><pre>>+
>+ ; row 2 & 3
>+ movh m0, [r1 + r2 * 2]
>+ lea r2, [r2 * 3]
>+ movh m1, [r1 + r2]
>+ movh [r0 + 16], m0
>+ movh [r0 + 24], m1
>+
>+ mova m0, [r0 + 16]
>+ packsswb m2, m0
>+ pcmpeqb m2, m4
>+
>+ ; get count
>+ ; CHECK_ME: Intel documents said POPCNT is SSE4.2 instruction, but just implement after Nehalem
>+%if 0
>+ pmovmskb eax, m2
>+ not ax
>+ popcnt ax, ax
> %else
>- movhlps m3, m2
>- paddw m2, m3
>-
>- mova m3, [pw_4]
>- paddw m3, m2
>- psadbw m3, m4
>-
>- movd eax, m3
>-%endif
>- RET
>+ mova m0, [pb_1]
>+ paddb m2, m0
</pre><pre>paddb m2, [pb_1]</pre><pre>>+ psadbw m2, m4
>+ pshufd m0, m2, 2
>+ paddw m2, m0
>+ movd eax, m2
>+ %endif
>+ RET
>
>
> INIT_YMM avx2
>@@ -4023,71 +4022,65 @@
>
>
</pre></div>