[x265] [PATCH] asm: assembly code for pixel_sse_ss_4x4
chen
chenm003 at 163.com
Fri Nov 22 16:21:46 CET 2013
>+;-----------------------------------------------------------------------------
>+; int pixel_ssd_WxH( uint16_t *, intptr_t, uint16_t *, intptr_t )
>+;-----------------------------------------------------------------------------
>+
>+%macro HEVC_SSD_SS 0
>+cglobal pixel_ssd_ss_4x4, 4,7,6
>+ pxor m0, m0
This pxor can be removed.
>+ pmovsxwd m1, [r0]
>+ pmovsxwd m2, [r2]
>+ psubd m1, m2
>+ pmulld m1, m1
>+ paddd m0, m1
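It is better to use pmulld's destination register directly here (i.e. compute this first row in m0, so neither the initial pxor nor this paddd is needed).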
>+ lea r0, [r0 + r1*2]
>+ lea r2, [r2 + r3*2]
Please fix the code indentation here.
>+ pmovsxwd m1, [r0]
>+ pmovsxwd m2, [r2]
>+ psubd m1, m2
>+ pmulld m1, m1
>+ paddd m0, m1
>+ lea r0, [r0 + r1*2]
>+ lea r2, [r2 + r3*2]
>+ pmovsxwd m1, [r0]
>+ pmovsxwd m2, [r2]
>+ psubd m1, m2
>+ pmulld m1, m1
>+ paddd m0, m1
>+ lea r0, [r0 + r1*2]
>+ lea r2, [r2 + r3*2]
>+ pmovsxwd m1, [r0]
>+ pmovsxwd m2, [r2]
>+ psubd m1, m2
>+ pmulld m1, m1
>+ paddd m0, m1
>+ phaddd m0, m0
>+ phaddd m0, m0
>+ movd eax, m0
>+ RET
>+%endmacro
>+
> %if HIGH_BIT_DEPTH == 0
> %macro SSD_LOAD_FULL 5
> mova m1, [t0+%1]
>@@ -512,12 +551,17 @@
> %define SSD_CORE SSD_CORE_SSE2
> %define JOIN JOIN_SSE2
> HEVC_SSD
>+HEVC_SSD_SS
> INIT_XMM ssse3
> %define SSD_CORE SSD_CORE_SSSE3
> %define JOIN JOIN_SSSE3
> HEVC_SSD
>+HEVC_SSD_SS
>+INIT_XMM sse4
>+HEVC_SSD_SS
> INIT_XMM avx
> HEVC_SSD
>+HEVC_SSD_SS
> INIT_MMX ssse3
> SSD 4, 4
> SSD 4, 8
>diff -r d2173ec27a15 -r 98bcf33302ef source/common/x86/pixel.h
>--- a/source/common/x86/pixel.h Thu Nov 21 20:16:39 2013 +0530
>+++ b/source/common/x86/pixel.h Fri Nov 22 18:57:18 2013 +0530
>@@ -59,6 +59,9 @@
> #define DECL_X1(name, suffix) \
> DECL_PIXELS(int, name, suffix, (pixel *, intptr_t, pixel *, intptr_t))
>
>+#define DECL_X1_SS(name, suffix) \
>+ DECL_PIXELS(int, name, suffix, (int16_t *, intptr_t, int16_t *, intptr_t))
>+
> #define DECL_X4(name, suffix) \
> DECL_PIXELS(void, name ## _x3, suffix, (pixel *, pixel *, pixel *, pixel *, intptr_t, int *)) \
> DECL_PIXELS(void, name ## _x4, suffix, (pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, int *))
>@@ -86,6 +89,15 @@
> DECL_X1(ssd, avx)
> DECL_X1(ssd, xop)
> DECL_X1(ssd, avx2)
>+DECL_X1_SS(ssd_ss, mmx)
>+DECL_X1_SS(ssd_ss, mmx2)
>+DECL_X1_SS(ssd_ss, sse2slow)
>+DECL_X1_SS(ssd_ss, sse2)
>+DECL_X1_SS(ssd_ss, ssse3)
>+DECL_X1_SS(ssd_ss, sse4)
>+DECL_X1_SS(ssd_ss, avx)
>+DECL_X1_SS(ssd_ss, xop)
>+DECL_X1_SS(ssd_ss, avx2)
> DECL_X1(satd, mmx2)
> DECL_X1(satd, sse2)
> DECL_X1(satd, ssse3)
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131122/e8799de9/attachment.html>
More information about the x265-devel
mailing list