[x265] [PATCH] asm: assembly code for pixel_sse_ss_4x4

chen chenm003 at 163.com
Fri Nov 22 16:21:46 CET 2013


>+;-----------------------------------------------------------------------------
>+; int pixel_ssd_WxH( uint16_t *, intptr_t, uint16_t *, intptr_t )
>+;-----------------------------------------------------------------------------
>+
>+%macro HEVC_SSD_SS 0
>+cglobal pixel_ssd_ss_4x4, 4,7,6
>+    pxor    m0, m0
This pxor can be removed.
 
>+    pmovsxwd  m1, [r0]
>+    pmovsxwd  m2, [r2]
>+    psubd     m1, m2

>+    pmulld    m1, m1
>+    paddd     m0, m1
Using the pmulld destination register directly is better.
 
>+    lea     r0, [r0 + r1*2]
>+    lea     r2, [r2 + r3*2]
Code indentation: the lea operands are not aligned with the surrounding instructions.

>+    pmovsxwd  m1, [r0]
>+    pmovsxwd  m2, [r2]
>+    psubd     m1, m2
>+    pmulld    m1, m1
>+    paddd     m0, m1
>+    lea     r0, [r0 + r1*2]
>+    lea     r2, [r2 + r3*2]
>+    pmovsxwd  m1, [r0]
>+    pmovsxwd  m2, [r2]
>+    psubd     m1, m2
>+    pmulld    m1, m1
>+    paddd     m0, m1
>+    lea     r0, [r0 + r1*2]
>+    lea     r2, [r2 + r3*2]
>+    pmovsxwd  m1, [r0]
>+    pmovsxwd  m2, [r2]
>+    psubd     m1, m2
>+    pmulld    m1, m1
>+    paddd     m0, m1
>+    phaddd  m0, m0
>+    phaddd  m0, m0
>+    movd   eax, m0
>+    RET
>+%endmacro
>+
> %if HIGH_BIT_DEPTH == 0
> %macro SSD_LOAD_FULL 5
>     mova      m1, [t0+%1]
>@@ -512,12 +551,17 @@
> %define SSD_CORE SSD_CORE_SSE2
> %define JOIN JOIN_SSE2
> HEVC_SSD
>+HEVC_SSD_SS
> INIT_XMM ssse3
> %define SSD_CORE SSD_CORE_SSSE3
> %define JOIN JOIN_SSSE3
> HEVC_SSD
>+HEVC_SSD_SS
>+INIT_XMM sse4
>+HEVC_SSD_SS
> INIT_XMM avx
> HEVC_SSD
>+HEVC_SSD_SS
> INIT_MMX ssse3
> SSD  4,  4
> SSD  4,  8
>diff -r d2173ec27a15 -r 98bcf33302ef source/common/x86/pixel.h
>--- a/source/common/x86/pixel.h Thu Nov 21 20:16:39 2013 +0530
>+++ b/source/common/x86/pixel.h Fri Nov 22 18:57:18 2013 +0530
>@@ -59,6 +59,9 @@
> #define DECL_X1(name, suffix) \
>     DECL_PIXELS(int, name, suffix, (pixel *, intptr_t, pixel *, intptr_t))
> 
>+#define DECL_X1_SS(name, suffix) \
>+    DECL_PIXELS(int, name, suffix, (int16_t *, intptr_t, int16_t *, intptr_t))
>+
> #define DECL_X4(name, suffix) \
>     DECL_PIXELS(void, name ## _x3, suffix, (pixel *, pixel *, pixel *, pixel *, intptr_t, int *)) \
>     DECL_PIXELS(void, name ## _x4, suffix, (pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, int *))
>@@ -86,6 +89,15 @@
> DECL_X1(ssd, avx)
> DECL_X1(ssd, xop)
> DECL_X1(ssd, avx2)
>+DECL_X1_SS(ssd_ss, mmx)
>+DECL_X1_SS(ssd_ss, mmx2)
>+DECL_X1_SS(ssd_ss, sse2slow)
>+DECL_X1_SS(ssd_ss, sse2)
>+DECL_X1_SS(ssd_ss, ssse3)
>+DECL_X1_SS(ssd_ss, sse4)
>+DECL_X1_SS(ssd_ss, avx)
>+DECL_X1_SS(ssd_ss, xop)
>+DECL_X1_SS(ssd_ss, avx2)
> DECL_X1(satd, mmx2)
> DECL_X1(satd, sse2)
> DECL_X1(satd, ssse3)
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131122/e8799de9/attachment.html>


More information about the x265-devel mailing list