[x264-devel] x86: SSSE3 implementation of pixel_sad_x3 and pixel_sad_x4

Henrik Gramner git at videolan.org
Fri Aug 23 23:06:32 CEST 2013


x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Fri Jul  5 21:15:54 2013 +0200| [0a6825c6fe9ee5d2ded73e8c43be3e6dfd6a7658] | committer: Jason Garrett-Glaser

x86: SSSE3 implementation of pixel_sad_x3 and pixel_sad_x4

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=0a6825c6fe9ee5d2ded73e8c43be3e6dfd6a7658
---

 common/pixel.c       |    5 ++---
 common/x86/sad-a.asm |   26 ++++++++++++++++++++++----
 2 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/common/pixel.c b/common/pixel.c
index cd799c3..e17d921 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1195,9 +1195,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         }
         else
         {
-            pixf->sad_x4[PIXEL_8x4] = x264_pixel_sad_x4_8x4_ssse3;
-            pixf->sad_x4[PIXEL_8x8] = x264_pixel_sad_x4_8x8_ssse3;
-            pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_ssse3;
+            INIT2( sad_x3, _ssse3 );
+            INIT5( sad_x4, _ssse3 );
         }
         if( (cpu&X264_CPU_SLOW_ATOM) || (cpu&X264_CPU_SLOW_SHUFFLE) )
         {
diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm
index 1fad08a..7c5a155 100644
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -1241,21 +1241,34 @@ SAD_X 4,  4,  4
 %endmacro
 
 %macro SAD_X3_END_SSE2 0
+    movifnidn r5, r5mp
+%if cpuflag(ssse3)
+    packssdw m0, m1
+    packssdw m2, m2
+    phaddd   m0, m2
+    mova   [r5], m0
+%else
     movhlps  m3, m0
     movhlps  m4, m1
     movhlps  m5, m2
     paddw    m0, m3
     paddw    m1, m4
     paddw    m2, m5
-    movifnidn r5, r5mp
     movd [r5+0], m0
     movd [r5+4], m1
     movd [r5+8], m2
+%endif
     RET
 %endmacro
 
 %macro SAD_X4_END_SSE2 0
     mov      r0, r6mp
+%if cpuflag(ssse3)
+    packssdw m0, m1
+    packssdw m2, m3
+    phaddd   m0, m2
+    mova   [r0], m0
+%else
     psllq    m1, 32
     psllq    m3, 32
     paddw    m0, m1
@@ -1266,6 +1279,7 @@ SAD_X 4,  4,  4
     paddw    m2, m3
     movq [r0+0], m0
     movq [r0+8], m2
+%endif
     RET
 %endmacro
 
@@ -1504,9 +1518,13 @@ cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,8
 %endmacro
 
 INIT_XMM ssse3
-SAD_X_SSSE3 4, 8, 16
-SAD_X_SSSE3 4, 8,  8
-SAD_X_SSSE3 4, 8,  4
+SAD_X_SSE2  3, 16, 16, 7
+SAD_X_SSE2  3, 16,  8, 7
+SAD_X_SSE2  4, 16, 16, 7
+SAD_X_SSE2  4, 16,  8, 7
+SAD_X_SSSE3 4,  8, 16
+SAD_X_SSSE3 4,  8,  8
+SAD_X_SSSE3 4,  8,  4
 
 INIT_XMM avx
 SAD_X_SSE2 3, 16, 16, 6



More information about the x264-devel mailing list