[x264-devel] [PATCH] SSE2 motion compensation [from before] + MMX/SSE2/SSSE3 frame_lowres_init
Jason Garrett-Glaser
darkshikari at gmail.com
Mon Mar 3 20:54:09 CET 2008
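This patch folds in the earlier SSE2 motion-compensation work (SSE2 16x16/16x8 pixel_avg, SSE2 avg2_w16/w20, a new 12-wide avg2, and SSE2 mc_luma/get_ref) and adds MMX/SSE2/SSSE3 implementations of the lowres frame downsampler. x264_frame_init_lowres moves into common/mc.c and its per-width core is exposed through the mc function table as frame_init_lowres_core, so each CPU type can override it; a checkasm test is included. The SSE2 paths are kept off on 3DNow!-capable (AMD) processors, where they are slower.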
Index: common/mc.c
===================================================================
--- common/mc.c (revision 745)
+++ common/mc.c (working copy)
@@ -336,6 +336,52 @@
void prefetch_ref_null( uint8_t *pix, int stride, int parity )
{}
+void x264_frame_init_lowres( x264_t *h, x264_frame_t *frame )
+{
+ const int i_stride = frame->i_stride[0];
+ const int i_stride2 = frame->i_stride_lowres;
+ const int i_width2 = frame->i_width_lowres;
+ int x, y;
+ uint8_t *src0 = frame->plane[0];
+ uint8_t *dst0 = frame->lowres[0];
+ uint8_t *dsth = frame->lowres[1];
+ uint8_t *dstv = frame->lowres[2];
+ uint8_t *dstc = frame->lowres[3];
+ /* Duplicate last column and row of pixels. */
+    for(y=0; y<frame->i_lines[0]; y++) src0[frame->i_width[0]+y*i_stride] = src0[frame->i_width[0]-1+y*i_stride];
+    for(y=0; y<frame->i_width[0]; y++) src0[y+i_stride*frame->i_lines[0]] = src0[y+i_stride*(frame->i_lines[0]-1)];
+    h->mc.frame_init_lowres_core( i_stride, i_stride2, frame->i_lines_lowres, i_width2, src0, dst0, dsth, dstv, dstc );
+
+ for( y = 0; y < 16; y++ )
+ for( x = 0; x < 16; x++ )
+ frame->i_cost_est[x][y] = -1;
+
+ x264_frame_expand_border_lowres( frame );
+}
+
+void frame_init_lowres_core( int src_stride, int dest_stride, int height, int width, uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc )
+{
+ int x,y;
+ for( y = 0; y < height; y++ )
+ {
+ uint8_t *src1 = src0+src_stride;
+ uint8_t *src2 = src1+src_stride;
+ for( x = 0; x < width; x++ )
+ {
+            /* Rounds twice (one pavgb-style average per pair) instead of once:
+               slower in C, but bit-exact with the assembly output. */
+            dst0[x] = (((src0[2*x  ] + src0[2*x+1] + 1) >> 1) + ((src1[2*x  ] + src1[2*x+1] + 1) >> 1) + 1) >> 1;
+            dsth[x] = (((src0[2*x+1] + src0[2*x+2] + 1) >> 1) + ((src1[2*x+1] + src1[2*x+2] + 1) >> 1) + 1) >> 1;
+            dstv[x] = (((src1[2*x  ] + src1[2*x+1] + 1) >> 1) + ((src2[2*x  ] + src2[2*x+1] + 1) >> 1) + 1) >> 1;
+            dstc[x] = (((src1[2*x+1] + src1[2*x+2] + 1) >> 1) + ((src2[2*x+1] + src2[2*x+2] + 1) >> 1) + 1) >> 1;
+ }
+ src0 += src_stride*2;
+ dst0 += dest_stride;
+ dsth += dest_stride;
+ dstv += dest_stride;
+ dstc += dest_stride;
+ }
+}
+
void x264_mc_init( int cpu, x264_mc_functions_t *pf )
{
pf->mc_luma = mc_luma;
@@ -373,11 +419,14 @@
pf->prefetch_fenc = prefetch_fenc_null;
pf->prefetch_ref = prefetch_ref_null;
+ pf->frame_init_lowres_core = frame_init_lowres_core;
#ifdef HAVE_MMX
x264_mc_init_mmx( cpu, pf );
if( cpu&X264_CPU_MMXEXT )
+ {
pf->mc_chroma = x264_mc_chroma_mmxext;
+ }
#endif
#ifdef ARCH_PPC
if( cpu&X264_CPU_ALTIVEC )
@@ -442,42 +491,3 @@
}
}
}
-
-void x264_frame_init_lowres( x264_t *h, x264_frame_t *frame )
-{
- // FIXME: tapfilter?
- const int i_stride = frame->i_stride[0];
- const int i_stride2 = frame->i_stride_lowres;
- const int i_width2 = frame->i_width_lowres;
- int x, y, i;
- for( y = 0; y < frame->i_lines_lowres - 1; y++ )
- {
- uint8_t *src0 = &frame->plane[0][2*y*i_stride];
- uint8_t *src1 = src0+i_stride;
- uint8_t *src2 = src1+i_stride;
- uint8_t *dst0 = &frame->lowres[0][y*i_stride2];
- uint8_t *dsth = &frame->lowres[1][y*i_stride2];
- uint8_t *dstv = &frame->lowres[2][y*i_stride2];
- uint8_t *dstc = &frame->lowres[3][y*i_stride2];
- for( x = 0; x < i_width2 - 1; x++ )
- {
-            dst0[x] = (src0[2*x  ] + src0[2*x+1] + src1[2*x  ] + src1[2*x+1] + 2) >> 2;
-            dsth[x] = (src0[2*x+1] + src0[2*x+2] + src1[2*x+1] + src1[2*x+2] + 2) >> 2;
-            dstv[x] = (src1[2*x  ] + src1[2*x+1] + src2[2*x  ] + src2[2*x+1] + 2) >> 2;
-            dstc[x] = (src1[2*x+1] + src1[2*x+2] + src2[2*x+1] + src2[2*x+2] + 2) >> 2;
-        }
-        dst0[x] = (src0[2*x  ] + src0[2*x+1] + src1[2*x  ] + src1[2*x+1] + 2) >> 2;
-        dstv[x] = (src1[2*x  ] + src1[2*x+1] + src2[2*x  ] + src2[2*x+1] + 2) >> 2;
-        dsth[x] = (src0[2*x+1] + src1[2*x+1] + 1) >> 1;
-        dstc[x] = (src1[2*x+1] + src2[2*x+1] + 1) >> 1;
-    }
-    for( i = 0; i < 4; i++ )
-        memcpy( &frame->lowres[i][y*i_stride2], &frame->lowres[i][(y-1)*i_stride2], i_width2 );
-
- for( y = 0; y < 16; y++ )
- for( x = 0; x < 16; x++ )
- frame->i_cost_est[x][y] = -1;
-
- x264_frame_expand_border_lowres( frame );
-}
-
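A note on the new reference downsampler above: it replaces the old single-rounding four-pixel average with two pavgb-style averages, each rounding by (a+b+1)>>1, so the C path is bit-exact with the SIMD code at the cost of a little speed. A minimal standalone sketch (not part of the patch; names are illustrative) showing that the two schemes can differ in the low bit:

    #include <stdio.h>
    #include <stdint.h>

    /* old reference: one rounding over four pixels */
    static uint8_t avg4_once( int a, int b, int c, int d )
    {
        return (a + b + c + d + 2) >> 2;
    }
    /* what pavgb computes for one byte pair */
    static uint8_t pavg( int a, int b )
    {
        return (a + b + 1) >> 1;
    }
    /* new reference: two roundings, matching the assembly */
    static uint8_t avg4_twice( int a, int b, int c, int d )
    {
        return pavg( pavg( a, b ), pavg( c, d ) );
    }

    int main( void )
    {
        /* prints "0 1": (1,0,0,0) rounds down once but up twice */
        printf( "%d %d\n", avg4_once( 1, 0, 0, 0 ), avg4_twice( 1, 0, 0, 0 ) );
        return 0;
    }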
Index: common/mc.h
===================================================================
--- common/mc.h (revision 745)
+++ common/mc.h (working copy)
@@ -65,7 +65,9 @@
uint8_t *pix_uv, int stride_uv, int mb_x );
/* prefetch the next few macroblocks of a hpel reference frame */
void (*prefetch_ref)( uint8_t *pix, int stride, int parity );
-
+
+    /* Initialize the four half-resolution (lowres) planes of a frame */
+    void (*frame_init_lowres_core)( int src_stride, int dest_stride, int height, int width, uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc );
} x264_mc_functions_t;
void x264_mc_init( int cpu, x264_mc_functions_t *pf );
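Callers go through the new function pointer, so the CPU-specific version is picked once at init time; roughly (a sketch, assuming the declarations above):

    x264_mc_functions_t mc;
    x264_mc_init( cpu_flags, &mc );  /* leaves the C core, or an MMX/SSE2/SSSE3 one */
    mc.frame_init_lowres_core( src_stride, dst_stride, height, width,
                               src0, dst0, dsth, dstv, dstc );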
Index: common/i386/mc-a.asm
===================================================================
--- common/i386/mc-a.asm (revision 745)
+++ common/i386/mc-a.asm (working copy)
@@ -128,21 +128,23 @@
movdqa [eax+ebx], xmm1
AVG_END
-%macro AVGH 2
-cglobal x264_pixel_avg_%1x%2_mmxext
+%macro AVGH 3
+cglobal x264_pixel_avg_%1x%2_%3
push esi
mov esi, %2
- jmp x264_pixel_avg_w%1_mmxext
+ jmp x264_pixel_avg_w%1_%3
%endmacro
-AVGH 16, 16
-AVGH 16, 8
-AVGH 8, 16
-AVGH 8, 8
-AVGH 8, 4
-AVGH 4, 8
-AVGH 4, 4
-AVGH 4, 2
+AVGH 16, 16, mmxext
+AVGH 16, 8, mmxext
+AVGH 8, 16, mmxext
+AVGH 8, 8, mmxext
+AVGH 8, 4, mmxext
+AVGH 4, 8, mmxext
+AVGH 4, 4, mmxext
+AVGH 4, 2, mmxext
+AVGH 16, 16, sse2
+AVGH 16, 8, sse2
%macro AVG2_START 1
cglobal %1
@@ -191,6 +193,21 @@
movq [eax+ebx], mm1
AVG2_END
+AVG2_START x264_pixel_avg2_w12_mmxext
+ movq mm0, [ecx]
+ movd mm1, [ecx+8]
+ movq mm2, [ecx+edx]
+ movd mm3, [ecx+edx+8]
+ pavgb mm0, [ecx+edi]
+ pavgb mm1, [ecx+edi+8]
+ pavgb mm2, [ecx+ebp]
+ pavgb mm3, [ecx+ebp+8]
+ movq [eax], mm0
+ movd [eax+8], mm1
+ movq [eax+ebx], mm2
+ movd [eax+ebx+8], mm3
+AVG2_END
+
AVG2_START x264_pixel_avg2_w16_mmxext
movq mm0, [ecx]
movq mm1, [ecx+8]
@@ -227,7 +244,33 @@
movd [eax+ebx+16], mm5
AVG2_END
+AVG2_START x264_pixel_avg2_w16_sse2
+ movdqu xmm0, [ecx]
+ movdqu xmm2, [ecx+edi]
+ movdqu xmm1, [ecx+edx]
+ movdqu xmm3, [ecx+ebp]
+ pavgb xmm0, xmm2
+ pavgb xmm1, xmm3
+ movdqa [eax], xmm0
+ movdqa [eax+ebx], xmm1
+AVG2_END
+AVG2_START x264_pixel_avg2_w20_sse2
+ movdqu xmm0, [ecx]
+ movdqu xmm2, [ecx+edi]
+ movdqu xmm1, [ecx+edx]
+ movdqu xmm3, [ecx+ebp]
+ movd mm2, [ecx+16]
+ movd mm5, [ecx+edx+16]
+ pavgb xmm0, xmm2
+ pavgb xmm1, xmm3
+ pavgb mm2, [ecx+edi+16]
+ pavgb mm5, [ecx+ebp+16]
+ movdqa [eax], xmm0
+ movd [eax+16], mm2
+ movdqa [eax+ebx], xmm1
+ movd [eax+ebx+16], mm5
+AVG2_END
;=============================================================================
; weighted prediction
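The 12-wide avg2 above fills the missing slot in the width tables, so mc_luma/get_ref can index them directly with i_width>>2 (widths 4/8/12/16/20 map to slots 1..5). Behaviorally it should match this C sketch (illustrative name; src2 shares src1's stride, as with the other avg2 routines):

    static void pixel_avg2_w12_ref( uint8_t *dst,  int i_dst_stride,
                                    uint8_t *src1, int i_src_stride,
                                    uint8_t *src2, int i_height )
    {
        int x, y;
        for( y = 0; y < i_height; y++ )
        {
            for( x = 0; x < 12; x++ )
                dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;  /* pavgb rounding */
            dst  += i_dst_stride;
            src1 += i_src_stride;
            src2 += i_src_stride;
        }
    }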
Index: common/i386/mc-c.c
===================================================================
--- common/i386/mc-c.c (revision 745)
+++ common/i386/mc-c.c (working copy)
@@ -28,6 +28,8 @@
#include "common/common.h"
/* NASM functions */
+extern void x264_pixel_avg_16x16_sse2( uint8_t *, int, uint8_t *, int );
+extern void x264_pixel_avg_16x8_sse2( uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_16x16_mmxext( uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_16x8_mmxext( uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_8x16_mmxext( uint8_t *, int, uint8_t *, int );
@@ -38,8 +40,11 @@
extern void x264_pixel_avg_4x2_mmxext( uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg2_w4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg2_w8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+extern void x264_pixel_avg2_w12_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg2_w16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg2_w20_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+extern void x264_pixel_avg2_w16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+extern void x264_pixel_avg2_w20_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int );
extern void x264_pixel_avg_weight_w16_mmxext( uint8_t *, int, uint8_t *, int, int, int );
@@ -52,7 +57,16 @@
extern void x264_prefetch_ref_mmxext( uint8_t *, int, int );
extern void x264_hpel_filter_mmxext( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
                                     int i_stride, int i_width, int i_height );
+extern void frame_init_lowres_core_ssse3_w64( int src_stride, int dest_stride, int height, int width, uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc );
+extern void frame_init_lowres_core_ssse3_w32( int src_stride, int dest_stride, int height, int width, uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc );
+extern void frame_init_lowres_core_ssse3_w16( int src_stride, int dest_stride, int height, int width, uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc );
+extern void frame_init_lowres_core_sse2_w64( int src_stride, int dest_stride, int height, int width, uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc );
+extern void frame_init_lowres_core_sse2_w32( int src_stride, int dest_stride, int height, int width, uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc );
+extern void frame_init_lowres_core_sse2_w16( int src_stride, int dest_stride, int height, int width, uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc );
+extern void frame_init_lowres_core_mmx_w32( int src_stride, int dest_stride, int height, int width, uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc );
+extern void frame_init_lowres_core_mmx_w16( int src_stride, int dest_stride, int height, int width, uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc );
+
#define AVG_WEIGHT(W,H) \
void x264_pixel_avg_weight_ ## W ## x ## H ## _mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_weight_dst ) \
{ \
@@ -69,9 +83,9 @@
NULL,
x264_pixel_avg2_w4_mmxext,
x264_pixel_avg2_w8_mmxext,
+ x264_pixel_avg2_w12_mmxext,
x264_pixel_avg2_w16_mmxext,
- x264_pixel_avg2_w16_mmxext,
- x264_pixel_avg2_w20_mmxext,
+ x264_pixel_avg2_w20_mmxext
};
static void (* const x264_mc_copy_wtab_mmx[5])( uint8_t *, int, uint8_t *, int, int ) =
{
@@ -81,57 +95,122 @@
NULL,
x264_mc_copy_w16_mmx
};
+static void (* const x264_pixel_avg_wtab_sse2[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =
+{
+ NULL,
+ x264_pixel_avg2_w4_mmxext,
+ x264_pixel_avg2_w8_mmxext,
+ x264_pixel_avg2_w12_mmxext,
+ x264_pixel_avg2_w16_sse2,
+ x264_pixel_avg2_w20_sse2
+};
+static void (* const x264_mc_copy_wtab_sse2[5])( uint8_t *, int, uint8_t *, int, int ) =
+{
+ NULL,
+ x264_mc_copy_w4_mmx,
+ x264_mc_copy_w8_mmx,
+ NULL,
+ x264_mc_copy_w16_sse2
+};
static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
-void mc_luma_mmxext( uint8_t *dst, int i_dst_stride,
- uint8_t *src[4], int i_src_stride,
- int mvx, int mvy,
- int i_width, int i_height )
-{
- int qpel_idx = ((mvy&3)<<2) + (mvx&3);
- int offset = (mvy>>2)*i_src_stride + (mvx>>2);
-    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
+#define MC_LUMA(name,instr1,instr2)\
+void mc_luma_##name( uint8_t *dst, int i_dst_stride,\
+ uint8_t *src[4], int i_src_stride,\
+ int mvx, int mvy,\
+ int i_width, int i_height )\
+{\
+ int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
+ int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
+    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\
+ if( qpel_idx & 5 ) /* qpel interpolation needed */\
+ {\
+        uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\
+ x264_pixel_avg_wtab_##instr1[i_width>>2](\
+ dst, i_dst_stride, src1, i_src_stride,\
+ src2, i_height );\
+ }\
+ else\
+ {\
+ x264_mc_copy_wtab_##instr2[i_width>>2](\
+ dst, i_dst_stride, src1, i_src_stride, i_height );\
+ }\
+}
- if( qpel_idx & 5 ) /* qpel interpolation needed */
- {
- uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
- x264_pixel_avg_wtab_mmxext[i_width>>2](
- dst, i_dst_stride, src1, i_src_stride,
- src2, i_height );
- }
- else
- {
- x264_mc_copy_wtab_mmx[i_width>>2](
- dst, i_dst_stride, src1, i_src_stride, i_height );
- }
+MC_LUMA(mmxext,mmxext,mmx)
+MC_LUMA(sse2,sse2,sse2)
+#include "bench.h"
+#define GET_REF(name)\
+uint8_t *get_ref_##name( uint8_t *dst, int *i_dst_stride,\
+ uint8_t *src[4], int i_src_stride,\
+ int mvx, int mvy,\
+ int i_width, int i_height )\
+{\
+ int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
+ int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
+    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\
+ if( qpel_idx & 5 ) /* qpel interpolation needed */\
+ {\
+        uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\
+ x264_pixel_avg_wtab_##name[i_width>>2](\
+ dst, *i_dst_stride, src1, i_src_stride,\
+ src2, i_height );\
+ return dst;\
+ }\
+ else\
+ {\
+ *i_dst_stride = i_src_stride;\
+ return src1;\
+ }\
}
-uint8_t *get_ref_mmxext( uint8_t *dst, int *i_dst_stride,
- uint8_t *src[4], int i_src_stride,
- int mvx, int mvy,
- int i_width, int i_height )
+GET_REF(mmxext)
+GET_REF(sse2)
+
+void frame_init_lowres_core_mmx( int src_stride, int dest_stride, int height, int width, uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc )
{
- int qpel_idx = ((mvy&3)<<2) + (mvx&3);
- int offset = (mvy>>2)*i_src_stride + (mvx>>2);
-    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
-
- if( qpel_idx & 5 ) /* qpel interpolation needed */
+ int x;
+    width = width >> 3; /* one unit = 8 dst (16 src) pixels, matching the w32/w16 kernels */
+ for( x = 0; width - x >= 2; x++ )
{
- uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
- x264_pixel_avg_wtab_mmxext[i_width>>2](
- dst, *i_dst_stride, src1, i_src_stride,
- src2, i_height );
- return dst;
+        frame_init_lowres_core_mmx_w32( src_stride, dest_stride, height, width, src0, dst0, dsth, dstv, dstc );
+ src0 += 32;
+ dst0 += 16;
+ dsth += 16;
+ dstv += 16;
+ dstc += 16;
+ x++;
}
- else
+ if(width - x == 1)
{
- *i_dst_stride = i_src_stride;
- return src1;
+        frame_init_lowres_core_mmx_w16( src_stride, dest_stride, height, width, src0, dst0, dsth, dstv, dstc );
}
}
+#define FRAME_INIT_SSE(name)\
+void frame_init_lowres_core_##name( int src_stride, int dest_stride, int height, int width, uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc )\
+{\
+    int x; width = width >> 3;\
+    for( x = 0; width - x >= 4; x++ )\
+    {\
+        frame_init_lowres_core_##name##_w64( src_stride, dest_stride, height, width, src0, dst0, dsth, dstv, dstc );\
+        src0 += 64; dst0 += 32; dsth += 32; dstv += 32; dstc += 32; x+=3;\
+    }\
+    /* continue from where the w64 loop left off */\
+    for( ; width - x >= 2; x++ )\
+    {\
+        frame_init_lowres_core_##name##_w32( src_stride, dest_stride, height, width, src0, dst0, dsth, dstv, dstc );\
+        src0 += 32; dst0 += 16; dsth += 16; dstv += 16; dstc += 16; x++;\
+    }\
+    if(width - x == 1)\
+    {\
+        frame_init_lowres_core_##name##_w16( src_stride, dest_stride, height, width, src0, dst0, dsth, dstv, dstc );\
+    }\
+}
+FRAME_INIT_SSE(sse2)
+FRAME_INIT_SSE(ssse3)
+
void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
{
if( !(cpu&X264_CPU_MMX) )
@@ -169,6 +248,19 @@
pf->prefetch_fenc = x264_prefetch_fenc_mmxext;
pf->prefetch_ref = x264_prefetch_ref_mmxext;
-
- /* todo: use sse2 */
+    // disable the SSE2 paths on 3DNow!-capable (AMD) processors, where they are slower
+ if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_3DNOW) )
+ {
+ pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
+ pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2;
+ pf->mc_luma = mc_luma_sse2;
+ pf->get_ref = get_ref_sse2;
+ pf->frame_init_lowres_core = frame_init_lowres_core_sse2;
+ }
+#ifdef HAVE_SSE3
+ if( cpu&X264_CPU_SSSE3 )
+ {
+ pf->frame_init_lowres_core = frame_init_lowres_core_ssse3;
+ }
+#endif //HAVE_SSE3
}
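FRAME_INIT_SSE above carves each row into the widest available kernels: as many 64-pixel source chunks as fit, then 32-pixel chunks, then at most one 16-pixel chunk; width is pre-shifted so one unit is 16 source (8 destination) pixels. The chunk sequence it produces can be modeled as (illustrative only):

    #include <stdio.h>

    static void lowres_kernel_plan( int dst_width )
    {
        int units = dst_width >> 3;  /* 8 dst pixels = 16 src pixels per unit */
        while( units >= 4 ) { puts( "w64" ); units -= 4; }
        while( units >= 2 ) { puts( "w32" ); units -= 2; }
        if( units == 1 )      puts( "w16" );
    }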
Index: common/i386/dct-a.asm
===================================================================
--- common/i386/dct-a.asm (revision 745)
+++ common/i386/dct-a.asm (working copy)
@@ -711,7 +711,7 @@
MMX_STORE_DIFF_8P xmm0, [ecx+FDEC_STRIDE*6], xmm6, xmm7
MMX_STORE_DIFF_8P xmm1, [ecx+FDEC_STRIDE*7], xmm6, xmm7
ret
-
+
;-----------------------------------------------------------------------------
; void __cdecl x264_sub8x8_dct_mmx( int16_t dct[4][4][4],
; uint8_t *pix1, uint8_t *pix2 )
Index: common/i386/mc-a2.asm
===================================================================
--- common/i386/mc-a2.asm (revision 745)
+++ common/i386/mc-a2.asm (working copy)
@@ -33,6 +33,7 @@
SECTION_RODATA
ALIGN 16
+pw_255: times 8 dw 255
pw_1: times 4 dw 1
pw_16: times 4 dw 16
pw_32: times 4 dw 32
@@ -324,4 +325,266 @@
pop edi
emms
ret
+
+%macro FILTER_START 2
+ picgetgot eax
+ mov%2 %1, [pw_255 GLOBAL]
+ push ebx
+ push edi
+ push esi
+ push ebp
+ mov ebp, [esp+4+16 ] ;src_stride
+ mov eax, [esp+20+16] ;source
+ mov ebx, [esp+24+16] ;dest0
+ mov ecx, [esp+28+16] ;desth
+ mov edx, [esp+32+16] ;destv
+ mov edi, [esp+36+16] ;destc
+ mov esi, [esp+8+16 ] ;dest_stride
+%endmacro
+%macro FILTER_END 0
+ pop ebp
+ pop esi
+ pop edi
+ pop ebx
+ ret
+%endmacro
+
+%macro FILTER_PREFETCH 1
+ prefetch [eax+ebp*%1]
+ prefetch [ebx+esi*%1]
+ prefetch [ecx+esi*%1]
+ prefetch [edx+esi*%1]
+ prefetch [edi+esi*%1]
+%endmacro
+
+%macro INIT_LOAD 3
+ mov%3 %1, [eax+1+%2]
+ pavgb %1, [eax+%2]
+%endmacro
+
+%macro WIDTH_FILTER 11
+ mov%8 %1, [eax+1+ebp+%6]
+ pavgb %1, [eax+ebp+%6]
+ mov%9 %2, %1
+ pavgb %1, %3
+ mov%9 %3, %2
+ pand %1, %11
+ psrlw %2, 8
+ packuswb %1, %1
+ packuswb %2, %2
+ mov%10 [%4+%7], %1
+ mov%10 [%5+%7], %2
+%endmacro
+
+%macro WIDTH16_FILTER_SSSE3 7
+ movdqa %2, [eax+ebp+%6]
+ movdqa xmm6, [eax+16+ebp+%6]
+ movdqa %1, %2
+ palignr %2, xmm6, 1
+ movdqa %2, %1
+ pavgb %1, %3
+ movdqa %3, %2
+ pand %1, xmm7
+ psrlw %2, 8
+ packuswb %1, %1
+ packuswb %2, %2
+ movq [%4+%7], %1
+ movq [%5+%7], %2
+%endmacro
+
+%macro WIDTH8_FILTER_MMX 7
+ WIDTH_FILTER %1, %2, %3, %4, %5, %6, %7, q, q, d, mm7
+%endmacro
+
+%macro WIDTH16_FILTER_SSE2 7
+ WIDTH_FILTER %1, %2, %3, %4, %5, %6, %7, dqu, dqa, q, xmm7
+%endmacro
+
+%macro LOWRES_FILTER_STEP_W64_SSE 3
+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm2, %1, %2, 0, 0
+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm3, %1, %2, 16, 8
+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm4, %1, %2, 32, 16
+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm5, %1, %2, 48, 24
+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm2, %1, %2, 0, 0
+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm3, %1, %2, 16, 8
+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm4, %1, %2, 32, 16
+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm5, %1, %2, 48, 24
+ add eax, ebp
+ add %1, esi
+ add %2, esi
+%endmacro
+
+%macro LOWRES_FILTER_STEP_W32_SSE 3
+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm2, %1, %2, 0, 0
+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm3, %1, %2, 16, 8
+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm2, %1, %2, 0, 0
+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm3, %1, %2, 16, 8
+ add eax, ebp
+ add %1, esi
+ add %2, esi
+%endmacro
+
+%macro LOWRES_FILTER_STEP_W16_SSE 3
+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm2, %1, %2, 0, 0
+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm2, %1, %2, 0, 0
+ add eax, ebp
+ add %1, esi
+ add %2, esi
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void frame_init_lowres_core_sse2_w64( int src_stride, int dest_stride, int height, int width,
+;                                       uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc )
+;-----------------------------------------------------------------------------
+cglobal frame_init_lowres_core_sse2_w64
+ FILTER_START xmm7, dqa
+ INIT_LOAD xmm2, 0, dqu
+ INIT_LOAD xmm3, 16, dqu
+ INIT_LOAD xmm4, 32, dqu
+ INIT_LOAD xmm5, 48, dqu
+ .vloop:
+ FILTER_PREFETCH 2
+ LOWRES_FILTER_STEP_W64_SSE ebx, ecx, SSE2
+ LOWRES_FILTER_STEP_W64_SSE edx, edi, SSE2
+ dec dword [esp+12+16]
+ jg .vloop
+ FILTER_END
+
+;-----------------------------------------------------------------------------
+; void frame_init_lowres_core_sse2_w32( int src_stride, int dest_stride, int height, int width,
+;                                       uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc )
+;-----------------------------------------------------------------------------
+cglobal frame_init_lowres_core_sse2_w32
+ FILTER_START xmm7, dqa
+ INIT_LOAD xmm2, 0, dqu
+ INIT_LOAD xmm3, 16, dqu
+ .vloop:
+ FILTER_PREFETCH 2
+ LOWRES_FILTER_STEP_W32_SSE ebx, ecx, SSE2
+ LOWRES_FILTER_STEP_W32_SSE edx, edi, SSE2
+ dec dword [esp+12+16]
+ jg .vloop
+ FILTER_END
+
+;-----------------------------------------------------------------------------
+; void frame_init_lowres_core_sse2_w16( int src_stride, int dest_stride, int height, int width,
+;                                       uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc )
+;-----------------------------------------------------------------------------
+cglobal frame_init_lowres_core_sse2_w16
+ FILTER_START xmm7, dqa
+ INIT_LOAD xmm2, 0, dqu
+ .vloop:
+ FILTER_PREFETCH 4
+ LOWRES_FILTER_STEP_W16_SSE ebx, ecx, SSE2
+ LOWRES_FILTER_STEP_W16_SSE edx, edi, SSE2
+ dec dword [esp+12+16]
+ jg .vloop
+ FILTER_END
+
+%macro LOWRES_FILTER_STEP_W32_MMX 2
+ WIDTH8_FILTER_MMX mm0, mm1, mm2, %1, %2, 0, 0
+ WIDTH8_FILTER_MMX mm0, mm1, mm3, %1, %2, 8, 4
+ WIDTH8_FILTER_MMX mm0, mm1, mm4, %1, %2, 16, 8
+ WIDTH8_FILTER_MMX mm0, mm1, mm5, %1, %2, 24, 12
+ WIDTH8_FILTER_MMX mm0, mm1, mm2, %1, %2, 0, 0
+ WIDTH8_FILTER_MMX mm0, mm1, mm3, %1, %2, 8, 4
+ WIDTH8_FILTER_MMX mm0, mm1, mm4, %1, %2, 16, 8
+ WIDTH8_FILTER_MMX mm0, mm1, mm5, %1, %2, 24, 12
+ add eax, ebp
+ add %1, esi
+ add %2, esi
+%endmacro
+
+%macro LOWRES_FILTER_STEP_W16_MMX 2
+ WIDTH8_FILTER_MMX mm0, mm1, mm2, %1, %2, 0, 0
+ WIDTH8_FILTER_MMX mm0, mm1, mm3, %1, %2, 8, 4
+ WIDTH8_FILTER_MMX mm0, mm1, mm2, %1, %2, 0, 0
+ WIDTH8_FILTER_MMX mm0, mm1, mm3, %1, %2, 8, 4
+ add eax, ebp
+ add %1, esi
+ add %2, esi
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void frame_init_lowres_core_mmx_w32( int src_stride, int dest_stride, int height, int width,
+;                                      uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc )
+;-----------------------------------------------------------------------------
+cglobal frame_init_lowres_core_mmx_w32
+ FILTER_START mm7, q
+ INIT_LOAD mm2, 0, q
+ INIT_LOAD mm3, 8, q
+ INIT_LOAD mm4, 16, q
+ INIT_LOAD mm5, 24, q
+ .vloop:
+ FILTER_PREFETCH 2
+ LOWRES_FILTER_STEP_W32_MMX ebx, ecx
+ LOWRES_FILTER_STEP_W32_MMX edx, edi
+ dec dword [esp+12+16]
+ jg .vloop
+ FILTER_END
+
+;-----------------------------------------------------------------------------
+; void frame_init_lowres_core_mmx_w16( int src_stride, int dest_stride, int height, int width,
+;                                      uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc )
+;-----------------------------------------------------------------------------
+cglobal frame_init_lowres_core_mmx_w16
+ FILTER_START mm7, q
+ INIT_LOAD mm2, 0, q
+ INIT_LOAD mm3, 8, q
+ .vloop:
+ FILTER_PREFETCH 2
+ LOWRES_FILTER_STEP_W16_MMX ebx, ecx
+ LOWRES_FILTER_STEP_W16_MMX edx, edi
+ dec dword [esp+12+16]
+ jg .vloop
+ FILTER_END
+
+;-----------------------------------------------------------------------------
+; void frame_init_lowres_core_ssse3_w64( int src_stride, int dest_stride, int height, int width,
+;                                        uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc )
+;-----------------------------------------------------------------------------
+cglobal frame_init_lowres_core_ssse3_w64
+ FILTER_START xmm7, dqa
+ INIT_LOAD xmm2, 0, dqu
+ INIT_LOAD xmm3, 16, dqu
+ INIT_LOAD xmm4, 32, dqu
+ INIT_LOAD xmm5, 48, dqu
+ .vloop:
+ FILTER_PREFETCH 2
+ LOWRES_FILTER_STEP_W64_SSE ebx, ecx, SSSE3
+ LOWRES_FILTER_STEP_W64_SSE edx, edi, SSSE3
+ dec dword [esp+12+16]
+ jg .vloop
+ FILTER_END
+
+;-----------------------------------------------------------------------------
+; void frame_init_lowres_core_ssse3_w32( int src_stride, int dest_stride, int height, int width,
+;                                        uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc )
+;-----------------------------------------------------------------------------
+cglobal frame_init_lowres_core_ssse3_w32
+ FILTER_START xmm7, dqa
+ INIT_LOAD xmm2, 0, dqu
+ INIT_LOAD xmm3, 16, dqu
+ .vloop:
+ FILTER_PREFETCH 2
+ LOWRES_FILTER_STEP_W32_SSE ebx, ecx, SSSE3
+ LOWRES_FILTER_STEP_W32_SSE edx, edi, SSSE3
+ dec dword [esp+12+16]
+ jg .vloop
+ FILTER_END
+
+;-----------------------------------------------------------------------------
+; void frame_init_lowres_core_ssse3_w16( int src_stride, int dest_stride, int height, int width,
+;                                        uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc )
+;-----------------------------------------------------------------------------
+cglobal frame_init_lowres_core_ssse3_w16
+ FILTER_START xmm7, dqa
+ INIT_LOAD xmm2, 0, dqu
+ .vloop:
+ FILTER_PREFETCH 4
+ LOWRES_FILTER_STEP_W16_SSE ebx, ecx, SSSE3
+ LOWRES_FILTER_STEP_W16_SSE edx, edi, SSSE3
+ dec dword [esp+12+16]
+ jg .vloop
+ FILTER_END
\ No newline at end of file
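In the filter kernels above, the averaged bytes for two adjacent lowres columns come out interleaved in one register; the kernels separate them with pand against pw_255 (even bytes, the dst0/dstv columns) and psrlw by 8 (odd bytes, the dsth/dstc columns), then repack each half to bytes with packuswb. A scalar model of that split (names are ours):

    #include <stdint.h>

    static void split_even_odd( const uint16_t *packed, int n,
                                uint8_t *even, uint8_t *odd )
    {
        int i;
        for( i = 0; i < n; i++ )
        {
            even[i] = packed[i] & 0xFF;  /* pand  reg, [pw_255] */
            odd[i]  = packed[i] >> 8;    /* psrlw reg, 8        */
        }
    }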
Index: tools/checkasm.c
===================================================================
--- tools/checkasm.c (revision 745)
+++ tools/checkasm.c (working copy)
@@ -407,8 +407,8 @@
uint8_t *src = &buf1[2*32+2];
uint8_t *src2[4] = { &buf1[2*32+2], &buf1[6*32+2],
&buf1[10*32+2], &buf1[14*32+2] };
- uint8_t *dst1 = &buf3[2*32+2];
- uint8_t *dst2 = &buf4[2*32+2];
+ uint8_t *dst1 = &buf3[2*32];
+ uint8_t *dst2 = &buf4[2*32];
int dx, dy, i, j, w;
int ret = 0, ok, used_asm;
@@ -519,7 +519,43 @@
for( w = -64; w <= 128 && ok; w++ )
MC_TEST_AVG( avg_weight, w );
report( "mc wpredb :" );
-
+
+ DECLARE_ALIGNED( uint8_t, src1[64*64], 16 );
+ DECLARE_ALIGNED( uint8_t, dst1a[64*64], 16 );
+ DECLARE_ALIGNED( uint8_t, dst2a[64*64], 16 );
+ DECLARE_ALIGNED( uint8_t, dst3a[64*64], 16 );
+ DECLARE_ALIGNED( uint8_t, dst4a[64*64], 16 );
+ DECLARE_ALIGNED( uint8_t, dst1b[64*64], 16 );
+ DECLARE_ALIGNED( uint8_t, dst2b[64*64], 16 );
+ DECLARE_ALIGNED( uint8_t, dst3b[64*64], 16 );
+ DECLARE_ALIGNED( uint8_t, dst4b[64*64], 16 );
+#define MC_TEST_LOWRES(w,h) \
+    if( mc_a.frame_init_lowres_core != mc_ref.frame_init_lowres_core ) \
+    { \
+        used_asm = 1; \
+        memset(src1, 0xCD, w*h); \
+        mc_c.frame_init_lowres_core( w, w/2, h/2, w/2, src1, dst1a, dst2a, dst3a, dst4a );\
+        mc_a.frame_init_lowres_core( w, w/2, h/2, w/2, src1, dst1b, dst2b, dst3b, dst4b );\
+        if( memcmp( dst1a, dst1b, w*h/4 ) || memcmp( dst2a, dst2b, w*h/4 )\
+         || memcmp( dst3a, dst3b, w*h/4 ) || memcmp( dst4a, dst4b, w*h/4 )) \
+        { \
+            ok = 0; \
+        } \
+    }
+ for( j = 0; j < 1000; j++)
+ {
+ for( i = 0; i < 64*64; i++ )
+ {
+ src1[i] = rand() & 0xFF;
+ }
+ MC_TEST_LOWRES(16,16);
+ MC_TEST_LOWRES(32,48);
+ MC_TEST_LOWRES(48,16);
+ MC_TEST_LOWRES(16,32);
+ MC_TEST_LOWRES(32,16);
+ MC_TEST_LOWRES(32,32);
+ }
+ report( "frame_init_lowres:" );
return ret;
}