[x265] [PATCH Review only] asm: code for scale2D_64to32 routine

chen chenm003 at 163.com
Thu Nov 14 16:06:08 CET 2013


Here are some details of the algorithm:
In:
A B
C D
Out:
(A + B + C + D + 2) / 4
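This is the standard MPEG-4 interpolateHV operation; you can refer to Xvid's code,
or implement it with pmaddubsw + pmulhrsw (pmaddubsw against a vector of byte 1s
gives the horizontal sums A + B and C + D as words; after adding the two rows,
pmulhrsw by 8192 performs the rounded divide by 4).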

>+;-----------------------------------------------------------------
>+; void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride)
>+;
>+; Downscales a 64x64 block to 32x32: every output pixel is the
>+; rounded average of a 2x2 group of source pixels
>+;     out = (A + B + C + D + 2) >> 2
>+; where A B are horizontal neighbours on one source row and C D the
>+; pixels directly below them.
>+;
>+; In:    r0 = dst    (written as 32 rows of 32 contiguous bytes)
>+;        r1 = src    (two rows of 64 bytes are read per iteration)
>+;        r2 = stride (byte distance between source rows)
>+; Uses:  r3 (row-pair counter), m0-m7; r0 and r1 are advanced.
>+; Notes: operates on byte pixels only (samples are split into words
>+;        with a byte mask and repacked with packuswb).
>+;        pw_00ff / pw_2 are defined outside this patch; they are
>+;        expected to hold 0x00FF and 2 in every word.
>+;-----------------------------------------------------------------
>+INIT_XMM ssse3
>+cglobal scale2D_64to32, 3, 4, 8, dest, src, stride
>+
>+    mova        m7,      [pw_00ff]          ; mask keeping the even (low) byte of each word
>+    mova        m6,      [pw_2]             ; rounding term added before the >> 2
>+    mov         r3d,     32                 ; 32 output rows = 32 source row pairs
>+.loop:
>+
>+    ; ---- source columns 0..15 -> 8 averaged words in m0 ----
>+    movu        m0,      [r1]               ; upper row
>+    psrlw       m1,      m0,    8           ; odd pixels  (B) as words
>+    movu        m2,      [r1 + r2]          ; lower row
>+    psrlw       m3,      m2,    8           ; odd pixels  (D) as words
>+
>+    pand        m0,      m7                 ; even pixels (A) as words
>+    pand        m2,      m7                 ; even pixels (C) as words
>+
>+    ; the sum is at most 4 * 255 + 2 = 1022, so plain word adds cannot overflow
>+    paddw       m0,      m1
>+    paddw       m0,      m2
>+    paddw       m0,      m3
>+    paddw       m0,      m6
>+    psrlw       m0,      2
>+
>+    ; ---- source columns 16..31 -> 8 averaged words in m4 ----
>+    movu        m4,      [r1 + 16]
>+    psrlw       m5,      m4,    8
>+    movu        m1,      [r1 + r2 + 16]
>+    psrlw       m2,      m1,    8
>+
>+    pand        m4,      m7
>+    pand        m1,      m7
>+
>+    paddw       m4,      m5
>+    paddw       m4,      m1
>+    paddw       m4,      m2
>+    paddw       m4,      m6
>+    psrlw       m4,      2
>+
>+    packuswb    m0,      m4                 ; 16 output pixels (columns 0..15)
>+    movu        [r0],    m0
>+
>+    ; ---- source columns 32..47 -> 8 averaged words in m0 ----
>+    movu        m0,      [r1 + 32]
>+    psrlw       m1,      m0,    8
>+    movu        m2,      [r1 + r2 + 32]
>+    psrlw       m3,      m2,    8
>+
>+    pand        m0,      m7
>+    pand        m2,      m7
>+
>+    paddw       m0,      m1
>+    paddw       m0,      m2
>+    paddw       m0,      m3
>+    paddw       m0,      m6
>+    psrlw       m0,      2
>+
>+    ; ---- source columns 48..63 -> 8 averaged words in m4 ----
>+    movu        m4,      [r1 + 48]
>+    psrlw       m5,      m4,    8
>+    movu        m1,      [r1 + r2 + 48]
>+    psrlw       m2,      m1,    8
>+
>+    pand        m4,      m7
>+    pand        m1,      m7
>+
>+    paddw       m4,      m5
>+    paddw       m4,      m1
>+    paddw       m4,      m2
>+    paddw       m4,      m6
>+    psrlw       m4,      2
>+
>+    packuswb    m0,      m4                 ; 16 output pixels (columns 16..31)
>+    movu        [r0 + 16],    m0
>+
>+    add         r0,      32                 ; next output row (32 bytes, contiguous)
>+    lea         r1,      [r1 + 2 * r2]      ; skip the two source rows just consumed
>+    dec         r3d
>+    jnz         .loop
>+
>+    RET
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131114/25b9ad70/attachment-0001.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: interpolate8x8_xmm.asm
Type: application/octet-stream
Size: 19010 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131114/25b9ad70/attachment-0001.obj>


More information about the x265-devel mailing list