[x265] [PATCH Review only] asm: code for sse_pp_12x16 routine

chen chenm003 at 163.com
Thu Nov 21 16:49:50 CET 2013


> ;-----------------------------------------------------------------------------
>+; int pixel_ssd_12x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
>+;-----------------------------------------------------------------------------
>+INIT_XMM ssse3
>+cglobal pixel_ssd_12x16, 4, 5, 7, src1, stride1, src2, stride2
>+
>+    pxor        m6,     m6
>+    mov         r4d,    4
>+
>+.loop
>+    movu        m0,    [r0]
>+    movu        m1,    [r2]
>+    movu        m2,    [r0 + r1]
>+    movu        m3,    [r2 + r3]
>+
>+    mova        m4,    m0
>+    mova        m5,    m1
>+    punpckhdq   m4,    m2
>+    punpckhdq   m5,    m3
Use the three-operand form instead of the mova + punpckhdq pair:
punpckhdq m4, m0, m2
On AVX and later it is faster.
 
>+
>+    pmovzxbw    m0,    m0
>+    pmovzxbw    m1,    m1
>+    pmovzxbw    m2,    m2
>+    pmovzxbw    m3,    m3
>+    pmovzxbw    m4,    m4
>+    pmovzxbw    m5,    m5
>+
>+    psubw       m0,    m1
>+    psubw       m2,    m3
>+    psubw       m4,    m5
>+
>+    pmaddwd     m0,    m0
>+    pmaddwd     m2,    m2
>+    pmaddwd     m4,    m4
>+
>+    paddd       m6,    m0
>+    paddd       m6,    m2
>+    paddd       m6,    m4
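Suggested order instead (the first add does not depend on m6, which shortens the dependency chain on the accumulator):
paddd m0, m2
paddd m6, m4
paddd m6, m0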

>+    movu        m0,    [r0 + 2 * r1]
>+    movu        m1,    [r2 + 2 * r3]
>+    lea         r0,    [r0 + 2 * r1]
>+    lea         r2,    [r2 + 2 * r3]
>+    movu        m2,    [r0 + r1]
>+    movu        m3,    [r2 + r3]
>+
>+    mova        m4,    m0
>+    mova        m5,    m1
>+    punpckhdq   m4,    m2
>+    punpckhdq   m5,    m3
>+
>+    pmovzxbw    m0,    m0
>+    pmovzxbw    m1,    m1
>+    pmovzxbw    m2,    m2
>+    pmovzxbw    m3,    m3
>+    pmovzxbw    m4,    m4
>+    pmovzxbw    m5,    m5
>+
>+    psubw       m0,    m1
>+    psubw       m2,    m3
>+    psubw       m4,    m5
>+
>+    pmaddwd     m0,    m0
>+    pmaddwd     m2,    m2
>+    pmaddwd     m4,    m4
>+
>+    paddd       m6,    m0
>+    paddd       m6,    m2
>+    paddd       m6,    m4
>+
>+    lea       r0,                    [r0 + 2 * r1]
>+    lea       r2,                    [r2 + 2 * r3]
>+    dec    r4d
>+    jnz    .loop
The dec could be moved earlier (the lea instructions do not modify flags) to hide the flags-register latency before jnz.
 
>+
>+    HADDD   m6, m1
>+    movd   eax, m6
>+
>+    RET
>+
>+;-----------------------------------------------------------------------------
> ; void pixel_ssd_nv12_core( uint16_t *pixuv1, intptr_t stride1, uint16_t *pixuv2, intptr_t stride2,
> ;                           int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
> ;
>diff -r 44d15f8ce940 -r 289b23f3ec9e source/common/x86/pixel.h
>--- a/source/common/x86/pixel.h Thu Nov 21 14:44:06 2013 +0530
>+++ b/source/common/x86/pixel.h Thu Nov 21 20:18:13 2013 +0530
>@@ -372,5 +372,6 @@
> uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos);
> void x265_transpose32_sse2(pixel *dest, pixel *src, intptr_t stride);
> void x265_transpose64_sse2(pixel *dest, pixel *src, intptr_t stride);
>+int x265_pixel_ssd_12x16_ssse3(pixel *, intptr_t, pixel *, intptr_t);
> 
> #endif // ifndef X265_I386_PIXEL_H
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131121/0f09ef55/attachment.html>


More information about the x265-devel mailing list