[x264-devel] [Alexander Izvorski <aizvorski at gmail.com>] Re: [patch] SSE2 pixel routines
System administration
admin at via.ecp.fr
Wed Jul 27 18:44:47 CEST 2005
The deleted attachment is at:
<http://www.videolan.org/~admin/20050727-videolan/sse2-pixel-routines-v4.diff>
----- Forwarded message from Alexander Izvorski <aizvorski at gmail.com> -----
From: Alexander Izvorski <aizvorski at gmail.com>
Date: Tue, 26 Jul 2005 20:30:54 -0700
To: x264-devel at videolan.org
Subject: Re: [patch] SSE2 pixel routines
Reply-To: Alexander Izvorski <aizvorski at gmail.com>
Hello,
The SSE2 patch is getting there; I'd like to propose this version as a
candidate to be committed. It is the same speed as, or slightly slower
than, the existing MMX routines on an Athlon64, but noticeably faster
on a P4 or Xeon. Some benchmark results are here:
http://www.firstmiletv.nl/vlc/x264/ (courtesy of Trax).
It should theoretically be possible to speed it up on the Athlon64 as
well, but I don't expect to make any progress on that without having
such a box available. It would be nice to have it off by default on
Athlon64 until it is actually faster there, but I'm not sure how best
to do that.
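
One way that might work, sketched below purely as an illustration (the
helper and flag names are made up for this sketch and are not part of
x264 or of this patch), is to have the CPU detection clear the SSE2
capability bit on AMD parts, so that the init code in common/pixel.c
never installs these routines there until they are actually faster:

    /* Illustrative sketch only (hypothetical names, GCC-style inline asm):
     * read the CPUID vendor string and mask off a hypothetical SSE2
     * capability flag on AuthenticAMD CPUs, where the SSE2 routines are
     * not yet faster than the MMX ones. */
    #include <stdint.h>
    #include <string.h>

    #define CPU_FLAG_SSE2 0x0004   /* hypothetical flag value */

    static void cpuid(uint32_t op, uint32_t *eax, uint32_t *ebx,
                      uint32_t *ecx, uint32_t *edx)
    {
        __asm__ volatile ("cpuid"
                          : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx)
                          : "a"(op));
    }

    static uint32_t mask_slow_sse2(uint32_t cpu_flags)
    {
        uint32_t eax, ebx, ecx, edx;
        char vendor[13];

        cpuid(0, &eax, &ebx, &ecx, &edx);
        memcpy(vendor + 0, &ebx, 4);   /* vendor string is returned in */
        memcpy(vendor + 4, &edx, 4);   /* ebx, edx, ecx order          */
        memcpy(vendor + 8, &ecx, 4);
        vendor[12] = '\0';

        if (!strcmp(vendor, "AuthenticAMD"))
            cpu_flags &= ~CPU_FLAG_SSE2;
        return cpu_flags;
    }

A command-line override could still force SSE2 back on for
benchmarking; I'll leave that to whoever knows the CPU-detection code
better.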
I reorganized the SSE2 code into a separate file; would you prefer
that, or would you rather have it in the same file as the existing
pixel routines?
Regards,
-Alex Izvorski
Index: Makefile
===================================================================
--- Makefile (revision 280)
+++ Makefile (working copy)
@@ -19,7 +19,8 @@
SRCS += common/i386/mc-c.c common/i386/dct-c.c common/i386/predict.c
ASMSRC = common/i386/dct-a.asm common/i386/cpu-a.asm \
common/i386/pixel-a.asm common/i386/mc-a.asm \
- common/i386/mc-a2.asm common/i386/predict-a.asm
+ common/i386/mc-a2.asm common/i386/predict-a.asm \
+ common/i386/pixel-sse2-a.asm
OBJASM = $(ASMSRC:%.asm=%.o)
endif
Index: common/pixel.c
===================================================================
--- common/pixel.c (revision 280)
+++ common/pixel.c (working copy)
@@ -431,6 +431,24 @@
pixf->satd[PIXEL_4x4] = x264_pixel_satd_4x4_mmxext;
}
#endif
+
+#ifdef HAVE_SSE2
+ if( cpu&X264_CPU_SSE2 )
+ {
+ pixf->sad[PIXEL_16x16] = x264_pixel_sad_16x16_sse2;
+ pixf->sad[PIXEL_16x8 ] = x264_pixel_sad_16x8_sse2;
+
+ pixf->ssd[PIXEL_16x16] = x264_pixel_ssd_16x16_sse2;
+ pixf->ssd[PIXEL_16x8] = x264_pixel_ssd_16x8_sse2;
+
+ pixf->satd[PIXEL_16x16]= x264_pixel_satd_16x16_sse2;
+ pixf->satd[PIXEL_16x8] = x264_pixel_satd_16x8_sse2;
+ pixf->satd[PIXEL_8x16] = x264_pixel_satd_8x16_sse2;
+ pixf->satd[PIXEL_8x8] = x264_pixel_satd_8x8_sse2;
+ pixf->satd[PIXEL_8x4] = x264_pixel_satd_8x4_sse2;
+ }
+#endif
+
#ifdef ARCH_PPC
if( cpu&X264_CPU_ALTIVEC )
{
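
As a reference for reviewers, here is a scalar C sketch of what the SAD
and SSD routines wired up above compute (illustrative names only, not
code from this patch); a matching sketch of SATD follows the assembly
at the end of the patch:

    #include <stdint.h>
    #include <stdlib.h>

    /* Sum of absolute differences over a width x height block, walking two
     * pixel planes with independent strides; this is the contract of the
     * psadbw-based x264_pixel_sad_*_sse2 routines. */
    static int sad_ref(uint8_t *pix1, int stride1, uint8_t *pix2, int stride2,
                       int width, int height)
    {
        int x, y, sum = 0;
        for (y = 0; y < height; y++)
            for (x = 0; x < width; x++)
                sum += abs(pix1[y * stride1 + x] - pix2[y * stride2 + x]);
        return sum;
    }

    /* Sum of squared differences, as computed by the x264_pixel_ssd_*_sse2
     * routines (absolute difference of the bytes, then squared and summed). */
    static int ssd_ref(uint8_t *pix1, int stride1, uint8_t *pix2, int stride2,
                       int width, int height)
    {
        int x, y, sum = 0;
        for (y = 0; y < height; y++)
            for (x = 0; x < width; x++)
            {
                int d = pix1[y * stride1 + x] - pix2[y * stride2 + x];
                sum += d * d;
            }
        return sum;
    }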
Index: common/i386/pixel.h
===================================================================
--- common/i386/pixel.h (revision 280)
+++ common/i386/pixel.h (working copy)
@@ -48,4 +48,18 @@
int x264_pixel_satd_4x8_mmxext( uint8_t *, int, uint8_t *, int );
int x264_pixel_satd_4x4_mmxext( uint8_t *, int, uint8_t *, int );
+#ifdef HAVE_SSE2
+int x264_pixel_sad_16x16_sse2( uint8_t *, int, uint8_t *, int );
+int x264_pixel_sad_16x8_sse2( uint8_t *, int, uint8_t *, int );
+
+int x264_pixel_ssd_16x16_sse2( uint8_t *, int, uint8_t *, int );
+int x264_pixel_ssd_16x8_sse2( uint8_t *, int, uint8_t *, int );
+
+int x264_pixel_satd_16x16_sse2( uint8_t *, int, uint8_t *, int );
+int x264_pixel_satd_16x8_sse2( uint8_t *, int, uint8_t *, int );
+int x264_pixel_satd_8x16_sse2( uint8_t *, int, uint8_t *, int );
+int x264_pixel_satd_8x8_sse2( uint8_t *, int, uint8_t *, int );
+int x264_pixel_satd_8x4_sse2( uint8_t *, int, uint8_t *, int );
#endif
+
+#endif
Index: common/i386/pixel-sse2-a.asm
===================================================================
--- common/i386/pixel-sse2-a.asm (revision 0)
+++ common/i386/pixel-sse2-a.asm (revision 0)
@@ -0,0 +1,601 @@
+;*****************************************************************************
+;* pixel.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2005 x264 project
+;* $Id: $
+;*
+;* Authors: Alex Izvorski <aizvorski at gmail.com>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
+;*****************************************************************************
+
+BITS 32
+
+;=============================================================================
+; Macros and other preprocessor constants
+;=============================================================================
+
+%macro cglobal 1
+ %ifdef PREFIX
+ global _%1
+ %define %1 _%1
+ %else
+ global %1
+ %endif
+%endmacro
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+
+cglobal x264_pixel_sad_16x16_sse2
+cglobal x264_pixel_sad_16x8_sse2
+cglobal x264_pixel_ssd_16x16_sse2
+cglobal x264_pixel_ssd_16x8_sse2
+cglobal x264_pixel_satd_8x4_sse2
+cglobal x264_pixel_satd_8x8_sse2
+cglobal x264_pixel_satd_16x8_sse2
+cglobal x264_pixel_satd_8x16_sse2
+cglobal x264_pixel_satd_16x16_sse2
+
+%macro SAD_INC_2x16P_SSE2 0
+ movdqu xmm1, [eax]
+ movdqu xmm2, [eax+ebx]
+ movdqu xmm3, [ecx]
+ movdqu xmm4, [ecx+edx]
+ psadbw xmm1, xmm3
+ psadbw xmm2, xmm4
+ lea eax, [eax+2*ebx]
+ paddw xmm1, xmm2
+ lea ecx, [ecx+2*edx]
+ paddw xmm0, xmm1
+%endmacro
+
+%macro SAD_INC_4x16P_SSE2 0
+ movdqu xmm1, [ecx]
+ movdqu xmm2, [ecx+edx]
+ lea ecx, [ecx+2*edx]
+ movdqu xmm3, [ecx]
+ movdqu xmm4, [ecx+edx]
+ psadbw xmm1, [eax]
+ psadbw xmm2, [eax+ebx]
+ lea eax, [eax+2*ebx]
+ psadbw xmm3, [eax]
+ psadbw xmm4, [eax+ebx]
+ lea eax, [eax+2*ebx]
+ lea ecx, [ecx+2*edx]
+ paddw xmm1, xmm2
+ paddw xmm3, xmm4
+ paddw xmm0, xmm1
+ paddw xmm0, xmm3
+%endmacro
+
+%macro SAD_START_SSE2 0
+ push ebx
+
+ mov eax, [esp+ 8] ; pix1
+ mov ebx, [esp+12] ; stride1
+ mov ecx, [esp+16] ; pix2
+ mov edx, [esp+20] ; stride2
+
+ pxor xmm0, xmm0
+%endmacro
+
+%macro SAD_END_SSE2 0
+ movdqa xmm1, xmm0
+ psrldq xmm1, 8
+ paddw xmm0, xmm1
+ movd eax, xmm0
+
+ pop ebx
+ ret
+%endmacro
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; int __cdecl x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_sad_16x16_sse2:
+ SAD_START_SSE2
+ SAD_INC_4x16P_SSE2
+ SAD_INC_4x16P_SSE2
+ SAD_INC_4x16P_SSE2
+ SAD_INC_4x16P_SSE2
+ SAD_END_SSE2
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; int __cdecl x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_sad_16x8_sse2:
+ SAD_START_SSE2
+ SAD_INC_4x16P_SSE2
+ SAD_INC_4x16P_SSE2
+ SAD_END_SSE2
+
+%macro SSD_INC_1x16P_SSE2 0
+ movdqu xmm1, [eax]
+ movdqu xmm2, [ecx]
+
+ movdqa xmm5, xmm1
+ psubusb xmm1, xmm2
+ psubusb xmm2, xmm5
+ por xmm1, xmm2
+
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm7
+ punpckhbw xmm2, xmm7
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+
+ add eax, ebx
+ add ecx, edx
+ paddd xmm0, xmm1
+ paddd xmm0, xmm2
+%endmacro
+
+%macro SSD_INC_2x16P_SSE2 0
+ movdqu xmm1, [eax]
+ movdqu xmm2, [ecx]
+ movdqu xmm3, [eax+ebx]
+ movdqu xmm4, [ecx+edx]
+
+ movdqa xmm5, xmm1
+ movdqa xmm6, xmm3
+ psubusb xmm1, xmm2
+ psubusb xmm3, xmm4
+ psubusb xmm2, xmm5
+ psubusb xmm4, xmm6
+ por xmm1, xmm2
+ por xmm3, xmm4
+
+ movdqa xmm2, xmm1
+ movdqa xmm4, xmm3
+ punpcklbw xmm1, xmm7
+ punpckhbw xmm2, xmm7
+ punpcklbw xmm3, xmm7
+ punpckhbw xmm4, xmm7
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ pmaddwd xmm3, xmm3
+ pmaddwd xmm4, xmm4
+
+ lea eax, [eax+2*ebx]
+ lea ecx, [ecx+2*edx]
+
+ paddd xmm1, xmm2
+ paddd xmm3, xmm4
+ paddd xmm0, xmm1
+ paddd xmm0, xmm3
+%endmacro
+
+%macro SSD_INC_8x16P_SSE2 0
+ SSD_INC_2x16P_SSE2
+ SSD_INC_2x16P_SSE2
+ SSD_INC_2x16P_SSE2
+ SSD_INC_2x16P_SSE2
+%endmacro
+
+%macro SSD_START_SSE2 0
+ push ebx
+
+ mov eax, [esp+ 8] ; pix1
+ mov ebx, [esp+12] ; stride1
+ mov ecx, [esp+16] ; pix2
+ mov edx, [esp+20] ; stride2
+
+ pxor xmm7, xmm7 ; zero
+ pxor xmm0, xmm0 ; xmm0 holds the sum
+%endmacro
+
+%macro SSD_END_SSE2 0
+ movdqa xmm1, xmm0
+ psrldq xmm1, 8
+ paddd xmm0, xmm1
+
+ movdqa xmm1, xmm0
+ psrldq xmm1, 4
+ paddd xmm0, xmm1
+ movd eax, xmm0
+
+ pop ebx
+ ret
+%endmacro
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; int __cdecl x264_pixel_ssd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_ssd_16x16_sse2:
+ SSD_START_SSE2
+ SSD_INC_8x16P_SSE2
+ SSD_INC_8x16P_SSE2
+ SSD_END_SSE2
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; int __cdecl x264_pixel_ssd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_ssd_16x8_sse2:
+ SSD_START_SSE2
+ SSD_INC_8x16P_SSE2
+ SSD_END_SSE2
+
+; %1=(row2, row0) %2=(row3, row1) %3=junk
+; output in %1=(row3, row0) and %3=(row2, row1)
+%macro HADAMARD4x4_SSE2 3
+
+ movdqa %3, %1
+ paddw %1, %2
+ psubw %3, %2
+ movdqa %2, %1
+ punpcklqdq %1, %3
+ punpckhqdq %2, %3
+ movdqa %3, %1
+ paddw %1, %2
+ psubw %3, %2
+
+%endmacro
+
+;;; two HADAMARD4x4_SSE2 running side-by-side
+%macro HADAMARD4x4_TWO_SSE2 6 ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6)
+ movdqa %3, %1
+ movdqa %6, %4
+ paddw %1, %2
+ paddw %4, %5
+ psubw %3, %2
+ psubw %6, %5
+ movdqa %2, %1
+ movdqa %5, %4
+ punpcklqdq %1, %3
+ punpcklqdq %4, %6
+ punpckhqdq %2, %3
+ punpckhqdq %5, %6
+ movdqa %3, %1
+ movdqa %6, %4
+ paddw %1, %2
+ paddw %4, %5
+ psubw %3, %2
+ psubw %6, %5
+%endmacro
+
+%macro TRANSPOSE4x4_TWIST_SSE2 3 ; %1=(row3, row0) %2=(row2, row1) %3=junk, output in %1 and %2
+
+ movdqa %3, %1
+ punpcklwd %1, %2
+ punpckhwd %2, %3 ; backwards because the high quadwords are already swapped
+
+ movdqa %3, %1
+ punpckldq %1, %2
+ punpckhdq %3, %2
+
+ movdqa %2, %1
+ punpcklqdq %1, %3
+ punpckhqdq %2, %3
+
+%endmacro
+
+;;; two TRANSPOSE4x4_TWIST_SSE2 running side-by-side
+%macro TRANSPOSE4x4_TWIST_TWO_SSE2 6 ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6)
+ movdqa %3, %1
+ movdqa %6, %4
+ punpcklwd %1, %2
+ punpcklwd %4, %5
+ punpckhwd %2, %3
+ punpckhwd %5, %6
+ movdqa %3, %1
+ movdqa %6, %4
+ punpckldq %1, %2
+ punpckldq %4, %5
+ punpckhdq %3, %2
+ punpckhdq %6, %5
+ movdqa %2, %1
+ movdqa %5, %4
+ punpcklqdq %1, %3
+ punpcklqdq %4, %6
+ punpckhqdq %2, %3
+ punpckhqdq %5, %6
+%endmacro
+
+
+;;; loads the difference of two 4x4 blocks into xmm0,xmm1 and xmm4,xmm5 in interleaved-row order
+;;; destroys xmm2, 3 and 7
+%macro LOAD4x8_DIFF_SSE2 0
+
+ pxor xmm7, xmm7
+
+ movq xmm0, [eax]
+ movq xmm1, [eax+ebx]
+ lea eax, [eax+2*ebx]
+
+ movq xmm4, [ecx]
+ movq xmm5, [ecx+edx]
+ lea ecx, [ecx+2*edx]
+
+ movq xmm2, [eax]
+ movq xmm3, [eax+ebx]
+ lea eax, [eax+2*ebx]
+
+ punpcklbw xmm0, xmm7
+ punpcklbw xmm1, xmm7
+ punpcklbw xmm4, xmm7
+ punpcklbw xmm5, xmm7
+
+ psubw xmm0, xmm4
+ psubw xmm1, xmm5
+
+ movq xmm4, [ecx]
+ movq xmm5, [ecx+edx]
+ lea ecx, [ecx+2*edx]
+
+ punpcklbw xmm2, xmm7
+ punpcklbw xmm3, xmm7
+ punpcklbw xmm4, xmm7
+ punpcklbw xmm5, xmm7
+
+ psubw xmm2, xmm4
+ psubw xmm3, xmm5
+
+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm1
+ punpcklqdq xmm0, xmm2 ; rows 0 and 2
+ punpcklqdq xmm1, xmm3 ; rows 1 and 3
+ punpckhqdq xmm4, xmm2 ; next 4x4 rows 0 and 2
+ punpckhqdq xmm5, xmm3 ; next 4x4 rows 1 and 3
+
+%endmacro
+
+%macro SUM4x4_SSE2 4 ; 02 13 junk sum
+
+ pxor %3, %3
+ psubw %3, %1
+ pmaxsw %1, %3
+
+ pxor %3, %3
+ psubw %3, %2
+ pmaxsw %2, %3
+
+ paddusw %4, %1
+ paddusw %4, %2
+
+%endmacro
+
+;;; two SUM4x4_SSE2 running side-by-side
+%macro SUM4x4_TWO_SSE2 7 ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6) sum
+ pxor %3, %3
+ pxor %6, %6
+ psubw %3, %1
+ psubw %6, %4
+ pmaxsw %1, %3
+ pmaxsw %4, %6
+ pxor %3, %3
+ pxor %6, %6
+ psubw %3, %2
+ psubw %6, %5
+ pmaxsw %2, %3
+ pmaxsw %5, %6
+ paddusw %1, %2
+ paddusw %4, %5
+ paddusw %7, %1
+ paddusw %7, %4
+%endmacro
+
+%macro SUM_MM_SSE2 2 ; sum junk
+ movdqa %2, %1
+ psrldq %1, 8
+ paddusw %1, %2
+ movdqa %2, %1
+ psrldq %1, 4
+ paddusw %1, %2
+ movdqa %2, %1
+ psrldq %1, 2
+ paddusw %1, %2
+ movd eax,%1
+ and eax,0xffff
+ shr eax,1
+ pxor %1, %1 ; fixme - can save an instruction or two here
+%endmacro
+
+%macro HADAMARD_SSE2 4 ; 02 13 junk sum
+ HADAMARD4x4_SSE2 %1, %2, %3
+ TRANSPOSE4x4_TWIST_SSE2 %1, %3, %2
+ HADAMARD4x4_SSE2 %1, %3, %2
+ SUM4x4_SSE2 %1, %2, %3, %4
+%endmacro
+
+%macro HADAMARD_TWO_SSE2 7 ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6) sum
+ HADAMARD4x4_TWO_SSE2 %1, %2, %3, %4, %5, %6
+ TRANSPOSE4x4_TWIST_TWO_SSE2 %1, %3, %2, %4, %6, %5
+ HADAMARD4x4_TWO_SSE2 %1, %3, %2, %4, %6, %5
+ SUM4x4_TWO_SSE2 %1, %2, %3, %4, %5, %6, %7
+%endmacro
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; int __cdecl x264_pixel_satd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_satd_16x16_sse2:
+ push ebx
+ push ebp
+
+ mov eax, [esp+12] ; pix1
+ mov ebx, [esp+16] ; stride1
+ mov ecx, [esp+20] ; pix2
+ mov edx, [esp+24] ; stride2
+
+ pxor xmm6, xmm6
+ xor ebp, ebp
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_TWO_SSE2 xmm0, xmm1, xmm2, xmm4, xmm5, xmm3, xmm6
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_TWO_SSE2 xmm0, xmm1, xmm2, xmm4, xmm5, xmm3, xmm6
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_TWO_SSE2 xmm0, xmm1, xmm2, xmm4, xmm5, xmm3, xmm6
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_TWO_SSE2 xmm0, xmm1, xmm2, xmm4, xmm5, xmm3, xmm6
+
+ SUM_MM_SSE2 xmm6, xmm7
+ add ebp, eax
+
+ mov eax, [esp+12]
+ mov ecx, [esp+20]
+ lea eax, [eax+8]
+ lea ecx, [ecx+8]
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_TWO_SSE2 xmm0, xmm1, xmm2, xmm4, xmm5, xmm3, xmm6
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_TWO_SSE2 xmm0, xmm1, xmm2, xmm4, xmm5, xmm3, xmm6
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_TWO_SSE2 xmm0, xmm1, xmm2, xmm4, xmm5, xmm3, xmm6
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_TWO_SSE2 xmm0, xmm1, xmm2, xmm4, xmm5, xmm3, xmm6
+
+ SUM_MM_SSE2 xmm6, xmm7
+ add ebp, eax
+ mov eax, ebp
+
+ pop ebp
+ pop ebx
+ ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; int __cdecl x264_pixel_satd_8x16_sse2 (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_satd_8x16_sse2:
+ push ebx
+ push ebp
+
+ mov eax, [esp+12] ; pix1
+ mov ebx, [esp+16] ; stride1
+ mov ecx, [esp+20] ; pix2
+ mov edx, [esp+24] ; stride2
+
+ pxor xmm6, xmm6
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_TWO_SSE2 xmm0, xmm1, xmm2, xmm4, xmm5, xmm3, xmm6
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_TWO_SSE2 xmm0, xmm1, xmm2, xmm4, xmm5, xmm3, xmm6
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_TWO_SSE2 xmm0, xmm1, xmm2, xmm4, xmm5, xmm3, xmm6
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_TWO_SSE2 xmm0, xmm1, xmm2, xmm4, xmm5, xmm3, xmm6
+
+ SUM_MM_SSE2 xmm6, xmm7
+
+ pop ebp
+ pop ebx
+ ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; int __cdecl x264_pixel_satd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_satd_16x8_sse2:
+ push ebx
+ push ebp
+
+ mov eax, [esp+12] ; pix1
+ mov ebx, [esp+16] ; stride1
+ mov ecx, [esp+20] ; pix2
+ mov edx, [esp+24] ; stride2
+
+ pxor xmm6, xmm6
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_TWO_SSE2 xmm0, xmm1, xmm2, xmm4, xmm5, xmm3, xmm6
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_TWO_SSE2 xmm0, xmm1, xmm2, xmm4, xmm5, xmm3, xmm6
+
+ mov eax, [esp+12]
+ mov ecx, [esp+20]
+ lea eax, [eax+8]
+ lea ecx, [ecx+8]
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_TWO_SSE2 xmm0, xmm1, xmm2, xmm4, xmm5, xmm3, xmm6
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_TWO_SSE2 xmm0, xmm1, xmm2, xmm4, xmm5, xmm3, xmm6
+
+ SUM_MM_SSE2 xmm6, xmm7
+
+ pop ebp
+ pop ebx
+ ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; int __cdecl x264_pixel_satd_8x8_sse2 (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_satd_8x8_sse2:
+ push ebx
+
+ mov eax, [esp+ 8] ; pix1
+ mov ebx, [esp+12] ; stride1
+ mov ecx, [esp+16] ; pix2
+ mov edx, [esp+20] ; stride2
+
+ pxor xmm6, xmm6
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_TWO_SSE2 xmm0, xmm1, xmm2, xmm4, xmm5, xmm3, xmm6
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_TWO_SSE2 xmm0, xmm1, xmm2, xmm4, xmm5, xmm3, xmm6
+
+ SUM_MM_SSE2 xmm6, xmm7
+
+ pop ebx
+ ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; int __cdecl x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_satd_8x4_sse2:
+ push ebx
+
+ mov eax, [esp+ 8] ; pix1
+ mov ebx, [esp+12] ; stride1
+ mov ecx, [esp+16] ; pix2
+ mov edx, [esp+20] ; stride2
+
+ pxor xmm6, xmm6
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_TWO_SSE2 xmm0, xmm1, xmm2, xmm4, xmm5, xmm3, xmm6
+
+ SUM_MM_SSE2 xmm6, xmm7
+
+ pop ebx
+ ret
+
----- End forwarded message -----
--
System administration <admin at via.ecp.fr>
VIA, Ecole Centrale Paris, France
--
This is the x264-devel mailing-list
To unsubscribe, go to: http://developers.videolan.org/lists.html