[x264-devel] x86: SSSE3 implementation of pixel_sad_x3 and pixel_sad_x4
Henrik Gramner
git at videolan.org
Fri Aug 23 23:06:32 CEST 2013
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Fri Jul 5 21:15:54 2013 +0200| [0a6825c6fe9ee5d2ded73e8c43be3e6dfd6a7658] | committer: Jason Garrett-Glaser
x86: SSSE3 implementation of pixel_sad_x3 and pixel_sad_x4
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=0a6825c6fe9ee5d2ded73e8c43be3e6dfd6a7658
---
common/pixel.c | 5 ++---
common/x86/sad-a.asm | 26 ++++++++++++++++++++++----
2 files changed, 24 insertions(+), 7 deletions(-)
diff --git a/common/pixel.c b/common/pixel.c
index cd799c3..e17d921 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1195,9 +1195,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
}
else
{
- pixf->sad_x4[PIXEL_8x4] = x264_pixel_sad_x4_8x4_ssse3;
- pixf->sad_x4[PIXEL_8x8] = x264_pixel_sad_x4_8x8_ssse3;
- pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_ssse3;
+ INIT2( sad_x3, _ssse3 );
+ INIT5( sad_x4, _ssse3 );
}
if( (cpu&X264_CPU_SLOW_ATOM) || (cpu&X264_CPU_SLOW_SHUFFLE) )
{
diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm
index 1fad08a..7c5a155 100644
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -1241,21 +1241,34 @@ SAD_X 4, 4, 4
%endmacro
%macro SAD_X3_END_SSE2 0
+ movifnidn r5, r5mp
+%if cpuflag(ssse3)
+ packssdw m0, m1
+ packssdw m2, m2
+ phaddd m0, m2
+ mova [r5], m0
+%else
movhlps m3, m0
movhlps m4, m1
movhlps m5, m2
paddw m0, m3
paddw m1, m4
paddw m2, m5
- movifnidn r5, r5mp
movd [r5+0], m0
movd [r5+4], m1
movd [r5+8], m2
+%endif
RET
%endmacro
%macro SAD_X4_END_SSE2 0
mov r0, r6mp
+%if cpuflag(ssse3)
+ packssdw m0, m1
+ packssdw m2, m3
+ phaddd m0, m2
+ mova [r0], m0
+%else
psllq m1, 32
psllq m3, 32
paddw m0, m1
@@ -1266,6 +1279,7 @@ SAD_X 4, 4, 4
paddw m2, m3
movq [r0+0], m0
movq [r0+8], m2
+%endif
RET
%endmacro
@@ -1504,9 +1518,13 @@ cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,8
%endmacro
INIT_XMM ssse3
-SAD_X_SSSE3 4, 8, 16
-SAD_X_SSSE3 4, 8, 8
-SAD_X_SSSE3 4, 8, 4
+SAD_X_SSE2 3, 16, 16, 7
+SAD_X_SSE2 3, 16, 8, 7
+SAD_X_SSE2 4, 16, 16, 7
+SAD_X_SSE2 4, 16, 8, 7
+SAD_X_SSSE3 4, 8, 16
+SAD_X_SSSE3 4, 8, 8
+SAD_X_SSSE3 4, 8, 4
INIT_XMM avx
SAD_X_SSE2 3, 16, 16, 6
More information about the x264-devel mailing list