[x265] [PATCH] saoCuOrgE0 asm code, improved 500.43 -> 466.58

chen chenm003 at 163.com
Tue Dec 30 14:51:10 CET 2014


right


At 2014-12-30 21:45:20,praveen at multicorewareinc.com wrote:
># HG changeset patch
># User Praveen Tiwari
># Date 1419246771 -19800
># Node ID ed7405ed73e062a32c1413f8868d5f73edde79ac
># Parent  32ed3f21039a5b93a54da8961442825e4db69d88
>saoCuOrgE0 asm code, improved 500.43 -> 466.58
>
>diff -r 32ed3f21039a -r ed7405ed73e0 source/common/x86/const-a.asm
>--- a/source/common/x86/const-a.asm	Mon Dec 29 13:49:02 2014 +0530
>+++ b/source/common/x86/const-a.asm	Mon Dec 22 16:42:51 2014 +0530
>@@ -50,6 +50,7 @@
> const pb_unpackwq2, db 4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7
> const pw_swap,      times 2 db 6,7,4,5,2,3,0,1
> 
>+const pb_2,        times 16 db 2
> const pb_4,        times 16 db 4
> const pb_16,       times 16 db 16
> const pb_64,       times 16 db 64
>diff -r 32ed3f21039a -r ed7405ed73e0 source/common/x86/loopfilter.asm
>--- a/source/common/x86/loopfilter.asm	Mon Dec 29 13:49:02 2014 +0530
>+++ b/source/common/x86/loopfilter.asm	Mon Dec 22 16:42:51 2014 +0530
>@@ -29,9 +29,12 @@
> 
> SECTION_RODATA 32
> 
>-pw_2:    times 16 db  2
> 
> SECTION .text
>+cextern pb_1
>+cextern pb_128
>+cextern pb_2
>+
> 
> ;============================================================================================================
> ; void saoCuOrgE0(pixel * rec, int8_t * offsetEo, int lcuWidth, int8_t signLeft)
>@@ -39,47 +42,44 @@
> INIT_XMM sse4
> cglobal saoCuOrgE0, 4, 4, 8, rec, offsetEo, lcuWidth, signLeft
> 
>-    neg         r3                 ; r3 = -iSignLeft
>-    movd        m0,    r3d
>-    pslldq      m0,    15          ; m0 = [iSignLeft x .. x]
>-    pcmpeqb     m4,    m4          ; m4 = [pb -1]
>-    pxor        m5,    m5          ; m5 = 0
>-    movh        m6,    [r1]        ; m6 = m_offsetEo
>+    neg         r3                          ; r3 = -signLeft
>+    movzx       r3d, r3b
>+    movd        m0, r3d
>+    mova        m4, [pb_128]                ; m4 = [pb 0x80]
>+    pxor        m5, m5                      ; m5 = 0
>+    movu        m6, [r1]                    ; m6 = offsetEo
> 
> .loop:
>-    movu        m7,    [r0]        ; m1 = pRec[x]
>-    mova        m1,    m7
>-    movu        m2,    [r0+1]      ; m2 = pRec[x+1]
>+    movu        m7, [r0]                    ; m7 = rec[x]
>+    movu        m2, [r0 + 1]                ; m2 = rec[x+1]
> 
>-    psubusb     m3,    m2, m7
>-    psubusb     m1,    m2
>-    pcmpeqb     m3,    m5
>-    pcmpeqb     m1,    m5
>-    pcmpeqb     m2,    m7
>+    pxor        m1, m7, m4
>+    pxor        m3, m2, m4
>+    pcmpgtb     m2, m1, m3
>+    pcmpgtb     m3, m1
>+    pand        m2, [pb_1]
>+    por         m2, m3
> 
>-    pabsb       m3,    m3          ; m1 = (pRec[x] - pRec[x+1]) > 0) ?  1 : 0
>-    por         m1,    m3          ; m1 = iSignRight
>-    pandn       m2,    m1
>+    pslldq      m3, m2, 1
>+    por         m3, m0
> 
>-    palignr     m3,    m2, m0, 15  ; m3 = -iSignLeft
>-    psignb      m3,    m4          ; m3 = iSignLeft
>-    mova        m0,    m4
>-    pslldq      m0,    15
>-    pand        m0,    m2          ; [pb 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1]
>-    paddb       m2,    m3
>-    paddb       m2,    [pw_2]      ; m1 = uiEdgeType
>-    pshufb      m3,    m6, m2
>-    pmovzxbw    m2,    m7          ; rec
>-    punpckhbw   m7,    m5
>-    pmovsxbw    m1,    m3          ; iOffsetEo
>-    punpckhbw   m3,    m3
>-    psraw       m3,    8
>-    paddw       m2,    m1
>-    paddw       m7,    m3
>-    packuswb    m2,    m7
>-    movu        [r0],  m2
>+    psignb      m3, m4                      ; m3 = signLeft
>+    pxor        m0, m0
>+    palignr     m0, m2, 15
>+    paddb       m2, m3
>+    paddb       m2, [pb_2]                  ; m2 = uiEdgeType
>+    pshufb      m3, m6, m2
>+    pmovzxbw    m2, m7                      ; rec
>+    punpckhbw   m7, m5
>+    pmovsxbw    m1, m3                      ; offsetEo
>+    punpckhbw   m3, m3
>+    psraw       m3, 8
>+    paddw       m2, m1
>+    paddw       m7, m3
>+    packuswb    m2, m7
>+    movu        [r0], m2
> 
>-    add         r0q,   16
>-    sub         r2d,   16
>+    add         r0q, 16
>+    sub         r2d, 16
>     jnz        .loop
>     RET
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20141230/2a537fab/attachment-0001.html>


More information about the x265-devel mailing list