[x264-devel] [Alexander Izvorski <aizvorski at gmail.com>] [patch] SSE2 pixel routines
System administration
admin at via.ecp.fr
Fri Jul 22 18:42:58 CEST 2005
The deleted attachment is at:
<http://www.videolan.org/~admin/20050722-videolan/sse2-pixel-routines.diff>
----- Forwarded message from Alexander Izvorski <aizvorski at gmail.com> -----
From: Alexander Izvorski <aizvorski at gmail.com>
Date: Fri, 22 Jul 2005 00:18:57 -0700
To: x264-devel at videolan.org
Subject: [patch] SSE2 pixel routines
Reply-To: Alexander Izvorski <aizvorski at gmail.com>
Hello,
Here is an early version of SSE2-optimized routines for sad 16x16 and
16x8, ssd 16x16 and 16x8, and satd from 16x16 to 8x4 (diff against rev
277). None of these have any special alignment requirements. I have
tested that they produce the same results as the mmxext versions, but
I'd appreciate it if someone else tested them as well. They are not
in their final form yet; there are a few places where a few more
instructions can be shaved off.
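For the equivalence testing, something like the sketch below is what I
have in mind; the buffer size, stride, and offsets are arbitrary, and
the prototypes are the ones this patch adds to pixel.h (the same loop
gets repeated for each of the other sizes):

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

/* prototypes as declared in common/i386/pixel.h */
int x264_pixel_satd_16x16_mmxext( uint8_t *, int, uint8_t *, int );
int x264_pixel_satd_16x16_sse2  ( uint8_t *, int, uint8_t *, int );

int main( void )
{
    static uint8_t buf1[64*64], buf2[64*64];
    int i, t;

    for( i = 0; i < 64*64; i++ )
    {
        buf1[i] = rand() & 0xff;
        buf2[i] = rand() & 0xff;
    }

    /* walk through some deliberately misaligned starting offsets */
    for( t = 0; t < 16; t++ )
    {
        int a = x264_pixel_satd_16x16_mmxext( buf1+t, 64, buf2+t, 64 );
        int b = x264_pixel_satd_16x16_sse2  ( buf1+t, 64, buf2+t, 64 );
        if( a != b )
            printf( "mismatch at offset %d: mmxext=%d sse2=%d\n", t, a, b );
    }
    return 0;
}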
So how fast are they? They are considerably faster, but I don't know
exactly how much in a typical setup, because the only SSE2-capable
machine I have is a 4-way Xeon box, which produces very anomalous
timing results. The instruction count is certainly lower: from 1268
to 852 in the case of satd 16x16. I would really appreciate numbers
for a single-processor Pentium 4 and an Athlon 64/Opteron. I'll post a
simple benchmarking tool that uses realistic memory access patterns
shortly.
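In the meantime, the rough shape of it is just rdtsc around the calls,
with the block offsets precomputed and scattered over a buffer much
larger than cache so the loads behave more like they do in a real
encode; the sizes and iteration count below are placeholders:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int x264_pixel_sad_16x16_sse2( uint8_t *, int, uint8_t *, int );

static inline uint64_t read_tsc( void )
{
    uint32_t lo, hi;
    __asm__ __volatile__( "rdtsc" : "=a"(lo), "=d"(hi) );
    return ((uint64_t)hi << 32) | lo;
}

#define RUNS 100000

int main( void )
{
    /* a buffer well past L2 size, so the loads are not all cache hits */
    int stride = 1024, height = 4096;
    uint8_t *pix = malloc( stride * height );
    int *off1 = malloc( RUNS * sizeof(int) );
    int *off2 = malloc( RUNS * sizeof(int) );
    uint64_t t0, t1;
    int i, sum = 0;

    for( i = 0; i < stride * height; i++ )
        pix[i] = rand() & 0xff;
    /* precompute random (and mostly unaligned) block positions so that
       rand() stays out of the timed loop */
    for( i = 0; i < RUNS; i++ )
    {
        off1[i] = (rand() % (height-16)) * stride + rand() % (stride-16);
        off2[i] = (rand() % (height-16)) * stride + rand() % (stride-16);
    }

    t0 = read_tsc();
    for( i = 0; i < RUNS; i++ )
        sum += x264_pixel_sad_16x16_sse2( pix+off1[i], stride, pix+off2[i], stride );
    t1 = read_tsc();

    printf( "%.1f cycles/call (checksum %d)\n", (double)(t1-t0)/RUNS, sum );
    free( off1 ); free( off2 ); free( pix );
    return 0;
}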
Regards,
-Alex Izvorski
P.S. I have a few questions as well from looking at the original code...
Why is the result of satd divided by two?! That throws away one bit
of precision, which would have a small but noticeable impact on PSNR
(see the "shr eax,1" in MMX_SUM_MM).
Why is MMX_SUM_MM called once for every four 4x4 blocks in the satd
functions? The maximum sum from a 4x4 block, as I understand it, is
2*256*4*4, and that will be split between four unsigned words, with
each one getting no more than 256*4*4. So (even before we divide the
result by two) it is impossible to saturate the accumulator with fewer
than sixteen 4x4 blocks, and in theory a single call to MMX_SUM_MM at
the end should be sufficient.
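For concreteness, here is the plain-C computation I believe one 4x4
block of the satd code performs (my own sketch, not the reference in
common/pixel.c); the final shift is the halving I'm asking about, and
the comment notes the coefficient magnitudes relevant to the saturation
question:

#include <stdint.h>
#include <stdlib.h>

/* 4-point unnormalized Hadamard butterfly; the output order is permuted,
   which doesn't matter since only absolute values are summed afterwards */
static void hadamard4( int d[4] )
{
    int s0 = d[0] + d[1], s1 = d[0] - d[1];
    int s2 = d[2] + d[3], s3 = d[2] - d[3];
    d[0] = s0 + s2; d[1] = s1 + s3;
    d[2] = s0 - s2; d[3] = s1 - s3;
}

int satd_4x4_ref( uint8_t *pix1, int i_stride1, uint8_t *pix2, int i_stride2 )
{
    int d[4][4], col[4];
    int i, j, sum = 0;

    for( i = 0; i < 4; i++ )
    {
        for( j = 0; j < 4; j++ )
            d[i][j] = pix1[i*i_stride1+j] - pix2[i*i_stride2+j];
        hadamard4( d[i] );              /* transform the rows */
    }
    for( j = 0; j < 4; j++ )
    {
        for( i = 0; i < 4; i++ )
            col[i] = d[i][j];
        hadamard4( col );               /* transform the columns */
        for( i = 0; i < 4; i++ )
            sum += abs( col[i] );
    }
    /* each coefficient is a +/- sum of the 16 pixel differences, so its
       magnitude is at most 16*255; the question is how many of these 4x4
       sums can pile up in a 16-bit lane before paddusw could saturate */
    return sum >> 1;                    /* the "shr eax,1" asked about above */
}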
The original version of HADAMARD4_SUB_BADC uses add-add-subtract; is
that faster than the equivalent move-add-subtract? (Not on Athlons,
but maybe on the P4?) The equivalent step in my version uses
move-add-subtract but can be changed very easily.
P.P.S. If anyone is interested in hacking on these or porting them
(although it's a bit early for that, since they will go through at least
one more revision), here is some info that may be helpful. sad and ssd
are straightforward, except that psadbw operates on each quadword
separately, so the two partial results have to be added together at the
end. satd is trickier: it loads and differences two 4x4 regions
simultaneously, then keeps one region's differences in registers while
doing the transform on the other (too bad there aren't enough registers
to do this in mmx; on the other hand, with the 16 128-bit registers that
altivec and amd64 have, we could do a 4x8 load ;). satd data is usually
passed around with one xmm register holding row 0 in the low quadword
and row 2 (!) in the high quadword, and the other register holding rows
1 and 3. hadamard4x4 takes this as input and produces outputs laid out
the same way but with the high quadwords swapped; transpose4x4 expects
that. Oh, and by the way, psrldq takes a shift in bytes, not bits (and
isn't that confusing). Hope this helps.
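To make that layout concrete, here is a small C model of what
LOAD4x8_DIFF_SSE2 leaves behind (my own restatement, with an xmm
register modelled as eight int16_t lanes, lane 0 lowest): xmm0/xmm1
hold the left 4x4 block and xmm4/xmm5 the right one.

#include <stdint.h>
#include <string.h>

typedef struct { int16_t w[8]; } xmm_t;   /* lane 0 = lowest word */

/* model of LOAD4x8_DIFF_SSE2: difference a 4-row by 8-column region,
   split it into a left and a right 4x4 block, rows interleaved as the
   satd code expects */
void load4x8_diff_model( uint8_t *p1, int i1, uint8_t *p2, int i2,
                         xmm_t *l02, xmm_t *l13, xmm_t *r02, xmm_t *r13 )
{
    int16_t d[4][8];
    int y, x;

    for( y = 0; y < 4; y++ )
        for( x = 0; x < 8; x++ )
            d[y][x] = p1[y*i1+x] - p2[y*i2+x];

    /* left 4x4 (columns 0..3): row 0 in the low quadword, row 2 (!) high */
    memcpy( &l02->w[0], &d[0][0], 4*sizeof(int16_t) );
    memcpy( &l02->w[4], &d[2][0], 4*sizeof(int16_t) );
    memcpy( &l13->w[0], &d[1][0], 4*sizeof(int16_t) );
    memcpy( &l13->w[4], &d[3][0], 4*sizeof(int16_t) );
    /* right 4x4 (columns 4..7), same interleaving */
    memcpy( &r02->w[0], &d[0][4], 4*sizeof(int16_t) );
    memcpy( &r02->w[4], &d[2][4], 4*sizeof(int16_t) );
    memcpy( &r13->w[0], &d[1][4], 4*sizeof(int16_t) );
    memcpy( &r13->w[4], &d[3][4], 4*sizeof(int16_t) );
}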
Index: common/i386/pixel.h
===================================================================
--- common/i386/pixel.h (revision 277)
+++ common/i386/pixel.h (working copy)
@@ -48,4 +48,16 @@
int x264_pixel_satd_4x8_mmxext( uint8_t *, int, uint8_t *, int );
int x264_pixel_satd_4x4_mmxext( uint8_t *, int, uint8_t *, int );
+int x264_pixel_sad_16x16_sse2( uint8_t *, int, uint8_t *, int );
+int x264_pixel_sad_16x8_sse2( uint8_t *, int, uint8_t *, int );
+
+int x264_pixel_ssd_16x16_sse2( uint8_t *, int, uint8_t *, int );
+int x264_pixel_ssd_16x8_sse2( uint8_t *, int, uint8_t *, int );
+
+int x264_pixel_satd_16x16_sse2( uint8_t *, int, uint8_t *, int );
+int x264_pixel_satd_16x8_sse2( uint8_t *, int, uint8_t *, int );
+int x264_pixel_satd_8x16_sse2( uint8_t *, int, uint8_t *, int );
+int x264_pixel_satd_8x8_sse2( uint8_t *, int, uint8_t *, int );
+int x264_pixel_satd_8x4_sse2( uint8_t *, int, uint8_t *, int );
+
#endif
Index: common/i386/pixel-a.asm
===================================================================
--- common/i386/pixel-a.asm (revision 277)
+++ common/i386/pixel-a.asm (working copy)
@@ -53,6 +53,19 @@
paddw mm0, mm3
%endmacro
+%macro SAD_INC_2x16P_SSE2 0
+ movdqu xmm1, [eax]
+ movdqu xmm2, [eax+ebx]
+ movdqu xmm3, [ecx]
+ movdqu xmm4, [ecx+edx]
+ psadbw xmm1, xmm3
+ psadbw xmm2, xmm4
+ lea eax, [eax+2*ebx]
+ paddw xmm1, xmm2
+ lea ecx, [ecx+2*edx]
+ paddw xmm0, xmm1
+%endmacro
+
%macro SAD_INC_2x8P 0
movq mm1, [eax]
movq mm2, [eax+ebx]
@@ -113,6 +126,27 @@
paddd mm0, mm4
%endmacro
+%macro SSD_INC_1x16P_SSE2 0
+ movdqu xmm1, [eax]
+ movdqu xmm2, [ecx]
+
+ movdqa xmm5, xmm1
+ psubusb xmm1, xmm2
+ psubusb xmm2, xmm5
+ por xmm1, xmm2
+
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm7
+ punpckhbw xmm2, xmm7
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+
+ add eax, ebx
+ add ecx, edx
+ paddd xmm0, xmm1
+ paddd xmm0, xmm2
+%endmacro
+
%macro SSD_INC_1x8P 0
movq mm1, [eax]
movq mm2, [ecx]
@@ -161,6 +195,17 @@
SSD_INC_1x16P
%endmacro
+%macro SSD_INC_8x16P_SSE2 0
+ SSD_INC_1x16P_SSE2
+ SSD_INC_1x16P_SSE2
+ SSD_INC_1x16P_SSE2
+ SSD_INC_1x16P_SSE2
+ SSD_INC_1x16P_SSE2
+ SSD_INC_1x16P_SSE2
+ SSD_INC_1x16P_SSE2
+ SSD_INC_1x16P_SSE2
+%endmacro
+
%macro SSD_INC_4x8P 0
SSD_INC_1x8P
SSD_INC_1x8P
@@ -303,6 +348,17 @@
cglobal x264_pixel_satd_8x16_mmxext
cglobal x264_pixel_satd_16x16_mmxext
+cglobal x264_pixel_sad_16x16_sse2
+cglobal x264_pixel_sad_16x8_sse2
+cglobal x264_pixel_ssd_16x16_sse2
+cglobal x264_pixel_ssd_16x8_sse2
+cglobal x264_pixel_satd_8x4_sse2
+cglobal x264_pixel_satd_8x8_sse2
+cglobal x264_pixel_satd_16x8_sse2
+cglobal x264_pixel_satd_8x16_sse2
+cglobal x264_pixel_satd_16x16_sse2
+
+
%macro SAD_START 0
push ebx
@@ -320,6 +376,27 @@
ret
%endmacro
+%macro SAD_START_SSE2 0
+ push ebx
+
+ mov eax, [esp+ 8] ; pix1
+ mov ebx, [esp+12] ; stride1
+ mov ecx, [esp+16] ; pix2
+ mov edx, [esp+20] ; stride2
+
+ pxor xmm0, xmm0
+%endmacro
+
+%macro SAD_END_SSE2 0
+ movdqa xmm1, xmm0
+ psrldq xmm1, 8
+ paddw xmm0, xmm1
+ movd eax, xmm0
+
+ pop ebx
+ ret
+%endmacro
+
ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_sad_16x16_mmxext (uint8_t *, int, uint8_t *, int )
@@ -338,6 +415,22 @@
ALIGN 16
;-----------------------------------------------------------------------------
+; int __cdecl x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_sad_16x16_sse2:
+ SAD_START_SSE2
+ SAD_INC_2x16P_SSE2
+ SAD_INC_2x16P_SSE2
+ SAD_INC_2x16P_SSE2
+ SAD_INC_2x16P_SSE2
+ SAD_INC_2x16P_SSE2
+ SAD_INC_2x16P_SSE2
+ SAD_INC_2x16P_SSE2
+ SAD_INC_2x16P_SSE2
+ SAD_END_SSE2
+
+ALIGN 16
+;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_sad_16x8_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_sad_16x8_mmxext:
@@ -350,6 +443,18 @@
ALIGN 16
;-----------------------------------------------------------------------------
+; int __cdecl x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_sad_16x8_sse2:
+ SAD_START_SSE2
+ SAD_INC_2x16P_SSE2
+ SAD_INC_2x16P_SSE2
+ SAD_INC_2x16P_SSE2
+ SAD_INC_2x16P_SSE2
+ SAD_END_SSE2
+
+ALIGN 16
+;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_sad_8x16_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_sad_8x16_mmxext:
@@ -432,6 +537,32 @@
ret
%endmacro
+%macro SSD_START_SSE2 0
+ push ebx
+
+ mov eax, [esp+ 8] ; pix1
+ mov ebx, [esp+12] ; stride1
+ mov ecx, [esp+16] ; pix2
+ mov edx, [esp+20] ; stride2
+
+ pxor xmm7, xmm7 ; zero
+ pxor xmm0, xmm0 ; mm0 holds the sum
+%endmacro
+
+%macro SSD_END_SSE2 0
+ movdqa xmm1, xmm0
+ psrldq xmm1, 8
+ paddd xmm0, xmm1
+
+ movdqa xmm1, xmm0
+ psrldq xmm1, 4
+ paddd xmm0, xmm1
+ movd eax, xmm0
+
+ pop ebx
+ ret
+%endmacro
+
ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_ssd_16x16_mmxext (uint8_t *, int, uint8_t *, int )
@@ -443,12 +574,28 @@
SSD_END
ALIGN 16
+;-----------------------------------------------------------------------------
+; int __cdecl x264_pixel_ssd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_ssd_16x16_sse2:
+ SSD_START_SSE2
+ SSD_INC_8x16P_SSE2
+ SSD_INC_8x16P_SSE2
+ SSD_END_SSE2
+
+ALIGN 16
x264_pixel_ssd_16x8_mmxext:
SSD_START
SSD_INC_8x16P
SSD_END
ALIGN 16
+x264_pixel_ssd_16x8_sse2:
+ SSD_START_SSE2
+ SSD_INC_8x16P_SSE2
+ SSD_END_SSE2
+
+ALIGN 16
x264_pixel_ssd_8x16_mmxext:
SSD_START
SSD_INC_4x8P
@@ -797,3 +944,302 @@
pop ebx
ret
+;-----------------------------------------------------------------------------
+
+;;; SSE2 satd stuff
+
+; %1=(row2, row0) %2=(row3, row1) %3=junk
+; output in %1=(row3, row0) and %3=(row2, row1)
+%macro HADAMARD4x4_SSE2 3
+
+ movdqa %3, %1
+ paddw %1, %2
+ psubw %3, %2
+ movdqa %2, %1
+ punpcklqdq %1, %3
+ punpckhqdq %2, %3
+ movdqa %3, %1
+ paddw %1, %2
+ psubw %3, %2
+
+%endmacro
+
+%macro TRANSPOSE4x4_TWIST_SSE2 3 ; %1=(row3, row0) %2=(row2, row1) %3=junk, output in %1 and %2
+
+ movdqa %3, %1
+ punpcklwd %1, %2
+ punpckhwd %2, %3 ; backwards because the high quadwords are already swapped
+
+ movdqa %3, %1
+ punpckldq %1, %2
+ punpckhdq %3, %2
+
+ movdqa %2, %1
+ punpcklqdq %1, %3
+ punpckhqdq %2, %3
+
+%endmacro
+
+;;; loads the difference of two 4x4 blocks into xmm0,xmm1 and xmm4,xmm5 in interleaved-row order
+;;; destroys xmm2, 3 and 7
+%macro LOAD4x8_DIFF_SSE2 0
+
+ pxor xmm7, xmm7
+
+ movq xmm0, [eax]
+ movq xmm1, [eax+ebx]
+ lea eax, [eax+2*ebx]
+
+ movq xmm4, [ecx]
+ movq xmm5, [ecx+edx]
+ lea ecx, [ecx+2*edx]
+
+ movq xmm2, [eax]
+ movq xmm3, [eax+ebx]
+ lea eax, [eax+2*ebx]
+
+ punpcklbw xmm0, xmm7
+ punpcklbw xmm1, xmm7
+ punpcklbw xmm4, xmm7
+ punpcklbw xmm5, xmm7
+
+ psubw xmm0, xmm4
+ psubw xmm1, xmm5
+
+ movq xmm4, [ecx]
+ movq xmm5, [ecx+edx]
+ lea ecx, [ecx+2*edx]
+
+ punpcklbw xmm2, xmm7
+ punpcklbw xmm3, xmm7
+ punpcklbw xmm4, xmm7
+ punpcklbw xmm5, xmm7
+
+ psubw xmm2, xmm4
+ psubw xmm3, xmm5
+
+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm1
+ punpcklqdq xmm0, xmm2 ; rows 0 and 2
+ punpcklqdq xmm1, xmm3 ; rows 1 and 3
+ punpckhqdq xmm4, xmm2 ; next 4x4 rows 0 and 2
+ punpckhqdq xmm5, xmm3 ; next 4x4 rows 1 and 3
+
+%endmacro
+
+%macro SUM4x4_SSE2 4 ; 02 13 junk sum
+
+ pxor %3, %3
+ psubw %3, %1
+ pmaxsw %1, %3
+
+ pxor %3, %3
+ psubw %3, %2
+ pmaxsw %2, %3
+
+ paddusw %4, %1
+ paddusw %4, %2
+
+%endmacro
+
+%macro SUM_MM_SSE2 2 ; sum junk
+ movdqa %2, %1
+ psrldq %1, 8
+ paddusw %1, %2
+ movdqa %2, %1
+ psrldq %1, 4
+ paddusw %1, %2
+ movdqa %2, %1
+ psrldq %1, 2
+ paddusw %1, %2
+ movd eax,%1
+ and eax,0xffff
+ shr eax,1
+ pxor %1, %1 ; fixme - can save an instruction or two here
+%endmacro
+
+%macro HADAMARD_SSE2 4 ; 02 13 junk sum
+ HADAMARD4x4_SSE2 %1, %2, %3
+ TRANSPOSE4x4_TWIST_SSE2 %1, %3, %2
+ HADAMARD4x4_SSE2 %1, %3, %2
+ SUM4x4_SSE2 %1, %2, %3, %4
+%endmacro
+
+ALIGN 16
+x264_pixel_satd_16x16_sse2:
+ push ebx
+ push ebp
+
+ mov eax, [esp+12] ; pix1
+ mov ebx, [esp+16] ; stride1
+ mov ecx, [esp+20] ; pix2
+ mov edx, [esp+24] ; stride2
+
+ pxor xmm6, xmm6
+ xor ebp, ebp
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_SSE2 xmm0, xmm1, xmm7, xmm6
+ HADAMARD_SSE2 xmm4, xmm5, xmm7, xmm6
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_SSE2 xmm0, xmm1, xmm7, xmm6
+ HADAMARD_SSE2 xmm4, xmm5, xmm7, xmm6
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_SSE2 xmm0, xmm1, xmm7, xmm6
+ HADAMARD_SSE2 xmm4, xmm5, xmm7, xmm6
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_SSE2 xmm0, xmm1, xmm7, xmm6
+ HADAMARD_SSE2 xmm4, xmm5, xmm7, xmm6
+
+ SUM_MM_SSE2 xmm6, xmm7
+ add ebp, eax
+
+ mov eax, [esp+12]
+ mov ecx, [esp+20]
+ lea eax, [eax+8]
+ lea ecx, [ecx+8]
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_SSE2 xmm0, xmm1, xmm7, xmm6
+ HADAMARD_SSE2 xmm4, xmm5, xmm7, xmm6
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_SSE2 xmm0, xmm1, xmm7, xmm6
+ HADAMARD_SSE2 xmm4, xmm5, xmm7, xmm6
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_SSE2 xmm0, xmm1, xmm7, xmm6
+ HADAMARD_SSE2 xmm4, xmm5, xmm7, xmm6
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_SSE2 xmm0, xmm1, xmm7, xmm6
+ HADAMARD_SSE2 xmm4, xmm5, xmm7, xmm6
+
+ SUM_MM_SSE2 xmm6, xmm7
+ add ebp, eax
+ mov eax, ebp
+
+ pop ebp
+ pop ebx
+ ret
+
+ALIGN 16
+x264_pixel_satd_8x16_sse2:
+ push ebx
+ push ebp
+
+ mov eax, [esp+12] ; pix1
+ mov ebx, [esp+16] ; stride1
+ mov ecx, [esp+20] ; pix2
+ mov edx, [esp+24] ; stride2
+
+ pxor xmm6, xmm6
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_SSE2 xmm0, xmm1, xmm7, xmm6
+ HADAMARD_SSE2 xmm4, xmm5, xmm7, xmm6
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_SSE2 xmm0, xmm1, xmm7, xmm6
+ HADAMARD_SSE2 xmm4, xmm5, xmm7, xmm6
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_SSE2 xmm0, xmm1, xmm7, xmm6
+ HADAMARD_SSE2 xmm4, xmm5, xmm7, xmm6
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_SSE2 xmm0, xmm1, xmm7, xmm6
+ HADAMARD_SSE2 xmm4, xmm5, xmm7, xmm6
+
+ SUM_MM_SSE2 xmm6, xmm7
+
+ pop ebp
+ pop ebx
+ ret
+
+ALIGN 16
+x264_pixel_satd_16x8_sse2:
+ push ebx
+ push ebp
+
+ mov eax, [esp+12] ; pix1
+ mov ebx, [esp+16] ; stride1
+ mov ecx, [esp+20] ; pix2
+ mov edx, [esp+24] ; stride2
+
+ pxor xmm6, xmm6
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_SSE2 xmm0, xmm1, xmm7, xmm6
+ HADAMARD_SSE2 xmm4, xmm5, xmm7, xmm6
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_SSE2 xmm0, xmm1, xmm7, xmm6
+ HADAMARD_SSE2 xmm4, xmm5, xmm7, xmm6
+
+ mov eax, [esp+12]
+ mov ecx, [esp+20]
+ lea eax, [eax+8]
+ lea ecx, [ecx+8]
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_SSE2 xmm0, xmm1, xmm7, xmm6
+ HADAMARD_SSE2 xmm4, xmm5, xmm7, xmm6
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_SSE2 xmm0, xmm1, xmm7, xmm6
+ HADAMARD_SSE2 xmm4, xmm5, xmm7, xmm6
+
+ SUM_MM_SSE2 xmm6, xmm7
+
+ pop ebp
+ pop ebx
+ ret
+
+ALIGN 16
+x264_pixel_satd_8x8_sse2:
+ push ebx
+
+ mov eax, [esp+ 8] ; pix1
+ mov ebx, [esp+12] ; stride1
+ mov ecx, [esp+16] ; pix2
+ mov edx, [esp+20] ; stride2
+
+ pxor xmm6, xmm6
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_SSE2 xmm0, xmm1, xmm7, xmm6
+ HADAMARD_SSE2 xmm4, xmm5, xmm7, xmm6
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_SSE2 xmm0, xmm1, xmm7, xmm6
+ HADAMARD_SSE2 xmm4, xmm5, xmm7, xmm6
+
+ SUM_MM_SSE2 xmm6, xmm7
+
+ pop ebx
+ ret
+
+ALIGN 16
+x264_pixel_satd_8x4_sse2:
+ push ebx
+
+ mov eax, [esp+ 8] ; pix1
+ mov ebx, [esp+12] ; stride1
+ mov ecx, [esp+16] ; pix2
+ mov edx, [esp+20] ; stride2
+
+ pxor xmm6, xmm6
+
+ LOAD4x8_DIFF_SSE2
+ HADAMARD_SSE2 xmm0, xmm1, xmm7, xmm6
+ HADAMARD_SSE2 xmm4, xmm5, xmm7, xmm6
+
+ SUM_MM_SSE2 xmm6, xmm7
+
+ pop ebx
+ ret
+
Index: common/pixel.c
===================================================================
--- common/pixel.c (revision 277)
+++ common/pixel.c (working copy)
@@ -29,6 +29,7 @@
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
+#include <stdio.h>
#include "x264.h"
#include "pixel.h"
@@ -431,6 +432,24 @@
pixf->satd[PIXEL_4x4] = x264_pixel_satd_4x4_mmxext;
}
#endif
+
+#ifdef HAVE_SSE2
+ if( cpu&X264_CPU_SSE2 )
+ {
+ pixf->sad[PIXEL_16x16] = x264_pixel_sad_16x16_sse2;
+ pixf->sad[PIXEL_16x8 ] = x264_pixel_sad_16x8_sse2;
+
+ pixf->ssd[PIXEL_16x16] = x264_pixel_ssd_16x16_sse2;
+ pixf->ssd[PIXEL_16x8] = x264_pixel_ssd_16x8_sse2;
+
+ pixf->satd[PIXEL_16x16]= x264_pixel_satd_16x16_sse2;
+ pixf->satd[PIXEL_16x8] = x264_pixel_satd_16x8_sse2;
+ pixf->satd[PIXEL_8x16] = x264_pixel_satd_8x16_sse2;
+ pixf->satd[PIXEL_8x8] = x264_pixel_satd_8x8_sse2;
+ pixf->satd[PIXEL_8x4] = x264_pixel_satd_8x4_sse2;
+ }
+#endif
+
#ifdef ARCH_PPC
if( cpu&X264_CPU_ALTIVEC )
{
----- End forwarded message -----
--
System administration <admin at via.ecp.fr>
VIA, Ecole Centrale Paris, France
--
This is the x264-devel mailing-list
To unsubscribe, go to: http://developers.videolan.org/lists.html