[x265] [PATCH] cvt16to32_cnt optimization

chen chenm003 at 163.com
Wed Sep 3 17:41:35 CEST 2014


I have a question: the new function is now cvt16to16_cnt; do we still need it?
Other comments are inline below.

At 2014-09-02 22:11:31,praveen at multicorewareinc.com wrote:
># HG changeset patch
># User Praveen Tiwari
># Date 1409571425 -19800
># Node ID 51b5a6d820da97a4178dc42d2ef98ffe1970511b
># Parent  c09f34b0ab57b4ce2f5cf4aa59c25d20eb6cbd54
>cvt16to32_cnt optimization
>
>diff -r c09f34b0ab57 -r 51b5a6d820da source/common/dct.cpp
>--- a/source/common/dct.cpp	Mon Aug 25 16:54:19 2014 +0530
>+++ b/source/common/dct.cpp	Mon Sep 01 17:07:05 2014 +0530
>@@ -834,7 +834,7 @@
>     {
>         for (int j = 0; j < trSize; j++)
>         {
>-            coeff[k * trSize + j] = ((int16_t)residual[k * stride + j]);
>+            coeff[k * trSize + j] = residual[k * stride + j];

memcpy?
 
>             numSig += (residual[k * stride + j] != 0);
>         }
>     }
>diff -r c09f34b0ab57 -r 51b5a6d820da source/common/x86/blockcopy8.asm
>--- a/source/common/x86/blockcopy8.asm	Mon Aug 25 16:54:19 2014 +0530
>+++ b/source/common/x86/blockcopy8.asm	Mon Sep 01 17:07:05 2014 +0530
>@@ -29,6 +29,10 @@
> 
> tab_Vm:    db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
> 
>+cextern pb_4
>+cextern pb_1
>+cextern pb_16
>+cextern pb_64
> cextern pw_4
> cextern pb_8
> cextern pb_32
>@@ -3946,52 +3950,47 @@
> 
> 
> ;--------------------------------------------------------------------------------------
>-; uint32_t cvt16to32_cnt(int32_t *dst, int16_t *src, intptr_t stride);
>+; uint32_t cvt16to32_cnt(int16_t *dst, int16_t *src, intptr_t stride);
> ;--------------------------------------------------------------------------------------
> INIT_XMM sse4
> cglobal cvt16to32_cnt_4, 3,3,5
>     add         r2d, r2d
>     pxor        m4, m4
> 
>-    ; row 0 & 1
>-    movh        m0, [r1]
>-    movhps      m0, [r1 + r2]
>-    mova        m2, m0
>-    pmovsxwd    m1, m0
>-    punpckhwd   m0, m0
>-    psrad       m0, 16
>-    movu        [r0 + 0 * mmsize], m1
>-    movu        [r0 + 1 * mmsize], m0
>-
>-    ; row 2 & 3
>-    movh        m0, [r1 + r2 * 2]
>-    lea         r2, [r2 * 3]
>-    movhps      m0, [r1 + r2]
>-    packsswb    m2, m0
>-    pcmpeqb     m2, m4
>-    pmovsxwd    m1, m0
>-    punpckhwd   m0, m0
>-    psrad       m0, 16
>-    movu        [r0 + 2 * mmsize], m1
>-    movu        [r0 + 3 * mmsize], m0
>-
>-    ; get count
>-    ; CHECK_ME: Intel documents said POPCNT is SSE4.2 instruction, but just implement after Nehalem
>-%if 1
>-    pmovmskb    eax, m2
>-    not         ax
>-    popcnt      ax, ax
>+     ; row 0 & 1
>+     movh        m0, [r1]
>+     movh        m1, [r1 + r2]
>+     movh        [r0], m0
>+     movh        [r0 + 8], m1

How about movh + movhps + mova here?
>+     mova        m2, [r0]

We can eliminate a memory operation after the above modification.
>+
>+     ; row 2 & 3
>+     movh        m0, [r1 + r2 * 2]
>+     lea         r2, [r2 * 3]
>+     movh        m1, [r1 + r2]
>+     movh        [r0 + 16], m0
>+     movh        [r0 + 24], m1
>+ 
>+     mova        m0, [r0 + 16]
>+     packsswb    m2, m0
>+     pcmpeqb     m2, m4
>+
>+     ; get count
>+     ; CHECK_ME: Intel documents said POPCNT is SSE4.2 instruction, but just implement after Nehalem
>+%if 0
>+     pmovmskb    eax, m2
>+     not         ax
>+     popcnt      ax, ax
> %else
>-    movhlps     m3, m2
>-    paddw       m2, m3
>-
>-    mova        m3, [pw_4]
>-    paddw       m3, m2
>-    psadbw      m3, m4
>-
>-    movd        eax, m3
>-%endif
>-    RET
>+     mova        m0, [pb_1]
>+     paddb       m2, m0

paddb m2, [pb_1]
>+     psadbw      m2, m4
>+     pshufd      m0, m2, 2
>+     paddw       m2, m0
>+     movd        eax, m2
>+ %endif
>+     RET
> 
> 
> INIT_YMM avx2
>@@ -4023,71 +4022,65 @@
> 
> 
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20140903/e1d476f3/attachment-0001.html>


More information about the x265-devel mailing list