[x264-devel] [patch] SSE2 motion compensation [pixel_avg w16/w20, mc_copy w16, pixel_avg_16x16, pixel_avg_16x8] + pixel_avg w12
Jason Garrett-Glaser
darkshikari at gmail.com
Mon Mar 3 01:40:42 CET 2008
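This adds SSE2 versions of pixel_avg (16x16, 16x8), pixel_avg2 (w16, w20) and mc_copy (w16), wires them into mc_luma/get_ref through new SSE2 function tables, and adds an mmxext pixel_avg2_w12 so 12-wide blocks no longer go through the w16 routine. For reference, pf->avg[] averages a second prediction into dst in place, with pavgb's rounding; roughly this in C (a sketch of the semantics only, the real C fallback is pixel_avg() in common/mc.c):

#include <stdint.h>

/* Rough C semantics of pf->avg[PIXEL_16x16]: the existing prediction
 * in dst is averaged with src, byte by byte, with pavgb's
 * round-to-nearest, i.e. (a+b+1)>>1. */
static void pixel_avg_16x16_ref( uint8_t *dst, int i_dst_stride,
                                 uint8_t *src, int i_src_stride )
{
    int x, y;
    for( y = 0; y < 16; y++ )
    {
        for( x = 0; x < 16; x++ )
            dst[x] = ( dst[x] + src[x] + 1 ) >> 1;
        dst += i_dst_stride;
        src += i_src_stride;
    }
}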
Index: common/mc.c
===================================================================
--- common/mc.c (revision 745)
+++ common/mc.c (working copy)
@@ -377,7 +377,9 @@
#ifdef HAVE_MMX
x264_mc_init_mmx( cpu, pf );
if( cpu&X264_CPU_MMXEXT )
+ {
pf->mc_chroma = x264_mc_chroma_mmxext;
+ }
#endif
#ifdef ARCH_PPC
if( cpu&X264_CPU_ALTIVEC )
Index: common/i386/mc-a.asm
===================================================================
--- common/i386/mc-a.asm (revision 745)
+++ common/i386/mc-a.asm (working copy)
@@ -128,21 +128,23 @@
movdqa [eax+ebx], xmm1
AVG_END
-%macro AVGH 2
-cglobal x264_pixel_avg_%1x%2_mmxext
+%macro AVGH 3
+cglobal x264_pixel_avg_%1x%2_%3
push esi
mov esi, %2
- jmp x264_pixel_avg_w%1_mmxext
+ jmp x264_pixel_avg_w%1_%3
%endmacro
-AVGH 16, 16
-AVGH 16, 8
-AVGH 8, 16
-AVGH 8, 8
-AVGH 8, 4
-AVGH 4, 8
-AVGH 4, 4
-AVGH 4, 2
+AVGH 16, 16, mmxext
+AVGH 16, 8, mmxext
+AVGH 8, 16, mmxext
+AVGH 8, 8, mmxext
+AVGH 8, 4, mmxext
+AVGH 4, 8, mmxext
+AVGH 4, 4, mmxext
+AVGH 4, 2, mmxext
+AVGH 16, 16, sse2
+AVGH 16, 8, sse2
%macro AVG2_START 1
cglobal %1
@@ -191,6 +193,21 @@
movq [eax+ebx], mm1
AVG2_END
+AVG2_START x264_pixel_avg2_w12_mmxext
+ movq mm0, [ecx]
+ movd mm1, [ecx+8]
+ movq mm2, [ecx+edx]
+ movd mm3, [ecx+edx+8]
+ pavgb mm0, [ecx+edi]
+ pavgb mm1, [ecx+edi+8]
+ pavgb mm2, [ecx+ebp]
+ pavgb mm3, [ecx+ebp+8]
+ movq [eax], mm0
+ movd [eax+8], mm1
+ movq [eax+ebx], mm2
+ movd [eax+ebx+8], mm3
+AVG2_END
+
AVG2_START x264_pixel_avg2_w16_mmxext
movq mm0, [ecx]
movq mm1, [ecx+8]
@@ -227,7 +244,33 @@
movd [eax+ebx+16], mm5
AVG2_END
+AVG2_START x264_pixel_avg2_w16_sse2
+ movdqu xmm0, [ecx]
+ movdqu xmm2, [ecx+edi]
+ movdqu xmm1, [ecx+edx]
+ movdqu xmm3, [ecx+ebp]
+ pavgb xmm0, xmm2
+ pavgb xmm1, xmm3
+ movdqa [eax], xmm0
+ movdqa [eax+ebx], xmm1
+AVG2_END
+AVG2_START x264_pixel_avg2_w20_sse2
+ movdqu xmm0, [ecx]
+ movdqu xmm2, [ecx+edi]
+ movdqu xmm1, [ecx+edx]
+ movdqu xmm3, [ecx+ebp]
+ movd mm2, [ecx+16]
+ movd mm5, [ecx+edx+16]
+ pavgb xmm0, xmm2
+ pavgb xmm1, xmm3
+ pavgb mm2, [ecx+edi+16]
+ pavgb mm5, [ecx+ebp+16]
+ movdqa [eax], xmm0
+ movd [eax+16], mm2
+ movdqa [eax+ebx], xmm1
+ movd [eax+ebx+16], mm5
+AVG2_END
;=============================================================================
; weighted prediction
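A note on the asm above: the two sources can be misaligned (for qpel positions src2 sits one byte off src1), so the loads are movdqu, while the stores are movdqa and rely on a 16-byte-aligned dst; the checkasm hunk at the end exists for exactly that reason. The w20 version keeps the trailing 4 bytes of each row in an mm register rather than doing a second 16-byte load. Per row, both new avg2 functions compute the following (sketch only; the asm processes two rows per iteration):

#include <stdint.h>

/* Per-row semantics shared by avg2_w16_sse2 and avg2_w20_sse2:
 * w20 splits the row into 16 bytes in xmm + 4 bytes in mm. */
static void avg2_row_ref( uint8_t *dst, const uint8_t *src1,
                          const uint8_t *src2, int i_width )
{
    int x;
    for( x = 0; x < i_width; x++ )  /* i_width = 16 or 20 */
        dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
}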
Index: common/i386/mc-c.c
===================================================================
--- common/i386/mc-c.c (revision 745)
+++ common/i386/mc-c.c (working copy)
@@ -28,6 +28,8 @@
#include "common/common.h"
/* NASM functions */
+extern void x264_pixel_avg_16x16_sse2( uint8_t *, int, uint8_t *, int );
+extern void x264_pixel_avg_16x8_sse2( uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_16x16_mmxext( uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_16x8_mmxext( uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_8x16_mmxext( uint8_t *, int, uint8_t *, int );
@@ -38,8 +40,11 @@
extern void x264_pixel_avg_4x2_mmxext( uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg2_w4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg2_w8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+extern void x264_pixel_avg2_w12_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg2_w16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg2_w20_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+extern void x264_pixel_avg2_w16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+extern void x264_pixel_avg2_w20_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int );
extern void x264_pixel_avg_weight_w16_mmxext( uint8_t *, int, uint8_t *, int, int, int );
@@ -69,9 +74,9 @@
NULL,
x264_pixel_avg2_w4_mmxext,
x264_pixel_avg2_w8_mmxext,
+ x264_pixel_avg2_w12_mmxext,
x264_pixel_avg2_w16_mmxext,
- x264_pixel_avg2_w16_mmxext,
- x264_pixel_avg2_w20_mmxext,
+ x264_pixel_avg2_w20_mmxext
};
static void (* const x264_mc_copy_wtab_mmx[5])( uint8_t *, int, uint8_t *, int, int ) =
{
@@ -81,56 +86,80 @@
NULL,
x264_mc_copy_w16_mmx
};
+static void (* const x264_pixel_avg_wtab_sse2[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =
+{
+ NULL,
+ x264_pixel_avg2_w4_mmxext,
+ x264_pixel_avg2_w8_mmxext,
+ x264_pixel_avg2_w12_mmxext,
+ x264_pixel_avg2_w16_sse2,
+ x264_pixel_avg2_w20_sse2
+};
+static void (* const x264_mc_copy_wtab_sse2[5])( uint8_t *, int, uint8_t *, int, int ) =
+{
+ NULL,
+ x264_mc_copy_w4_mmx,
+ x264_mc_copy_w8_mmx,
+ NULL,
+ x264_mc_copy_w16_sse2
+};
static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
-void mc_luma_mmxext( uint8_t *dst, int i_dst_stride,
- uint8_t *src[4], int i_src_stride,
- int mvx, int mvy,
- int i_width, int i_height )
-{
- int qpel_idx = ((mvy&3)<<2) + (mvx&3);
- int offset = (mvy>>2)*i_src_stride + (mvx>>2);
-    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
-
- if( qpel_idx & 5 ) /* qpel interpolation needed */
- {
- uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
- x264_pixel_avg_wtab_mmxext[i_width>>2](
- dst, i_dst_stride, src1, i_src_stride,
- src2, i_height );
- }
- else
- {
- x264_mc_copy_wtab_mmx[i_width>>2](
- dst, i_dst_stride, src1, i_src_stride, i_height );
- }
+#define MC_LUMA(name,instr1,instr2)\
+void mc_luma_##name( uint8_t *dst, int i_dst_stride,\
+ uint8_t *src[4], int i_src_stride,\
+ int mvx, int mvy,\
+ int i_width, int i_height )\
+{\
+ int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
+ int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
+    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\
+ if( qpel_idx & 5 ) /* qpel interpolation needed */\
+ {\
+        uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\
+ x264_pixel_avg_wtab_##instr1[i_width>>2](\
+ dst, i_dst_stride, src1, i_src_stride,\
+ src2, i_height );\
+ }\
+ else\
+ {\
+ x264_mc_copy_wtab_##instr2[i_width>>2](\
+ dst, i_dst_stride, src1, i_src_stride, i_height );\
+ }\
}
-uint8_t *get_ref_mmxext( uint8_t *dst, int *i_dst_stride,
- uint8_t *src[4], int i_src_stride,
- int mvx, int mvy,
- int i_width, int i_height )
-{
- int qpel_idx = ((mvy&3)<<2) + (mvx&3);
- int offset = (mvy>>2)*i_src_stride + (mvx>>2);
-    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
-
- if( qpel_idx & 5 ) /* qpel interpolation needed */
- {
- uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
- x264_pixel_avg_wtab_mmxext[i_width>>2](
- dst, *i_dst_stride, src1, i_src_stride,
- src2, i_height );
- return dst;
- }
- else
- {
- *i_dst_stride = i_src_stride;
- return src1;
- }
+MC_LUMA(mmxext,mmxext,mmx)
+MC_LUMA(sse2,sse2,sse2)
+#include "bench.h"
+#define GET_REF(name)\
+uint8_t *get_ref_##name( uint8_t *dst, int *i_dst_stride,\
+ uint8_t *src[4], int i_src_stride,\
+ int mvx, int mvy,\
+ int i_width, int i_height )\
+{\
+ int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
+ int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
+    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\
+ if( qpel_idx & 5 ) /* qpel interpolation needed */\
+ {\
+        uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\
+ START_TIMER;\
+ x264_pixel_avg_wtab_##name[i_width>>2](\
+ dst, *i_dst_stride, src1, i_src_stride,\
+ src2, i_height );\
+ if(i_width>>2 == 5) STOP_TIMER("w20");\
+ return dst;\
+ }\
+ else\
+ {\
+ *i_dst_stride = i_src_stride;\
+ return src1;\
+ }\
}
+GET_REF(mmxext)
+GET_REF(sse2)
void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
{
@@ -169,6 +198,12 @@
pf->prefetch_fenc = x264_prefetch_fenc_mmxext;
pf->prefetch_ref = x264_prefetch_ref_mmxext;
-
- /* todo: use sse2 */
+ // disable on AMD processors since it is slower
+ if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_3DNOW) )
+ {
+ pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
+ pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2;
+ pf->mc_luma = mc_luma_sse2;
+ pf->get_ref = get_ref_sse2;
+ }
}
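Two notes on the mc-c.c changes. mc_luma/get_ref pick the avg2 routine with i_width>>2, so widths {4,8,12,16,20} map to table slots {1,2,3,4,5}; the new w12 entry fills the slot that previously aliased w16. As for the cpuid gate: every AMD chip with SSE2 at this point also reports 3DNow!, and Intel chips never do, so !(cpu&X264_CPU_3DNOW) is effectively an "Intel only" test; on K8, 128-bit SSE2 ops are split into two 64-bit halves internally, so the xmm versions lose to the mmxext ones there. A minimal sketch of the resulting dispatch (flag names shortened, not x264's real init code):

#include <stdint.h>

/* Hypothetical flag values for illustration; the real ones are
 * X264_CPU_SSE2 and X264_CPU_3DNOW from x264.h. */
#define CPU_SSE2  0x1
#define CPU_3DNOW 0x2

typedef void (*avg_fn)( uint8_t *, int, uint8_t *, int );

/* Pick the avg implementation the same way the init code above does:
 * SSE2 only when the chip isn't an AMD part (3DNow! as the proxy). */
static avg_fn choose_avg( int cpu, avg_fn mmxext_ver, avg_fn sse2_ver )
{
    if( (cpu & CPU_SSE2) && !(cpu & CPU_3DNOW) )
        return sse2_ver;
    return mmxext_ver;
}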
Index: tools/checkasm.c
===================================================================
--- tools/checkasm.c (revision 745)
+++ tools/checkasm.c (working copy)
@@ -407,8 +407,8 @@
uint8_t *src = &buf1[2*32+2];
uint8_t *src2[4] = { &buf1[2*32+2], &buf1[6*32+2],
&buf1[10*32+2], &buf1[14*32+2] };
- uint8_t *dst1 = &buf3[2*32+2];
- uint8_t *dst2 = &buf4[2*32+2];
+ uint8_t *dst1 = &buf3[2*32];
+ uint8_t *dst2 = &buf4[2*32];
int dx, dy, i, j, w;
int ret = 0, ok, used_asm;
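The checkasm change is needed because the new SSE2 code stores with movdqa: moving dst1/dst2 from &buf[2*32+2] to &buf[2*32] keeps the destination 16-byte aligned (assuming the buffers come from x264_malloc, which aligns to 16), where the old +2 offset would fault; the sources keep their +2 so the movdqu load path is still exercised. A sketch of the invariant, under that alignment assumption:

#include <assert.h>
#include <stdint.h>

/* Why dst moved from buf[2*32+2] to buf[2*32]: rows are 32 bytes, so
 * with a 16-byte-aligned buffer the dst pointer stays aligned for
 * movdqa stores, while src at +2 still tests unaligned loads. */
static void check_mc_buffer_alignment( uint8_t *buf3 )
{
    uint8_t *dst = &buf3[2*32];
    uint8_t *src = &buf3[2*32+2];
    assert( ((uintptr_t)dst & 15) == 0 );  /* safe for movdqa stores */
    assert( ((uintptr_t)src & 15) != 0 );  /* exercises movdqu loads */
}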