[x264-devel] [PATCH] SSE2 motion compensation [from before] + MMX/SSE2/SSSE3 frame_lowres_init
Jason Garrett-Glaser
darkshikari at gmail.com
Mon Mar 3 20:54:09 CET 2008
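This patch folds in the earlier SSE2 motion-compensation work (SSE2 16x16/16x8 pixel_avg, SSE2 avg2_w16/w20, a new 12-wide avg2, and SSE2 mc_luma/get_ref) and adds MMX/SSE2/SSSE3 implementations of the lowres frame downsampler. x264_frame_init_lowres moves into common/mc.c and its per-width core is exposed through the mc function table as frame_init_lowres_core, so each CPU type can override it; a checkasm test is included. The SSE2 paths are kept off on 3DNow!-capable (AMD) processors, where they are slower.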
Index: common/mc.c
===================================================================
--- common/mc.c (revision 745)
+++ common/mc.c (working copy)
@@ -336,6 +336,52 @@
void prefetch_ref_null( uint8_t *pix, int stride, int parity )
{}
+void x264_frame_init_lowres( x264_t *h, x264_frame_t *frame )
+{
+ const int i_stride = frame->i_stride[0];
+ const int i_stride2 = frame->i_stride_lowres;
+ const int i_width2 = frame->i_width_lowres;
+ int x, y;
+ uint8_t *src0 = frame->plane[0];
+ uint8_t *dst0 = frame->lowres[0];
+ uint8_t *dsth = frame->lowres[1];
+ uint8_t *dstv = frame->lowres[2];
+ uint8_t *dstc = frame->lowres[3];
+ /* Duplicate last column and row of pixels. */
+    for(y=0; y<frame->i_lines[0]; y++) src0[frame->i_width[0]+y*i_stride] = src0[frame->i_width[0]-1+y*i_stride];
+    for(y=0; y<frame->i_width[0]; y++) src0[y+i_stride*frame->i_lines[0]] = src0[y+i_stride*(frame->i_lines[0]-1)];
+    h->mc.frame_init_lowres_core( i_stride, i_stride2, frame->i_lines_lowres, i_width2, src0, dst0, dsth, dstv, dstc );
+
+ for( y = 0; y < 16; y++ )
+ for( x = 0; x < 16; x++ )
+ frame->i_cost_est[x][y] = -1;
+
+ x264_frame_expand_border_lowres( frame );
+}
+
+void frame_init_lowres_core( int src_stride, int dest_stride, int height, int width, uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc )
+{
+ int x,y;
+ for( y = 0; y < height; y++ )
+ {
+ uint8_t *src1 = src0+src_stride;
+ uint8_t *src2 = src1+src_stride;
+ for( x = 0; x < width; x++ )
+ {
+            /* Rounds twice (one pavgb-style average per pair) instead of once:
+               slower in C, but bit-exact with the assembly output. */
+            dst0[x] = (((src0[2*x  ] + src0[2*x+1] + 1) >> 1) + ((src1[2*x  ] + src1[2*x+1] + 1) >> 1) + 1) >> 1;
+            dsth[x] = (((src0[2*x+1] + src0[2*x+2] + 1) >> 1) + ((src1[2*x+1] + src1[2*x+2] + 1) >> 1) + 1) >> 1;
+            dstv[x] = (((src1[2*x  ] + src1[2*x+1] + 1) >> 1) + ((src2[2*x  ] + src2[2*x+1] + 1) >> 1) + 1) >> 1;
+            dstc[x] = (((src1[2*x+1] + src1[2*x+2] + 1) >> 1) + ((src2[2*x+1] + src2[2*x+2] + 1) >> 1) + 1) >> 1;
+ }
+ src0 += src_stride*2;
+ dst0 += dest_stride;
+ dsth += dest_stride;
+ dstv += dest_stride;
+ dstc += dest_stride;
+ }
+}
+
void x264_mc_init( int cpu, x264_mc_functions_t *pf )
{
pf->mc_luma = mc_luma;
@@ -373,11 +419,14 @@
pf->prefetch_fenc = prefetch_fenc_null;
pf->prefetch_ref = prefetch_ref_null;
+ pf->frame_init_lowres_core = frame_init_lowres_core;
#ifdef HAVE_MMX
x264_mc_init_mmx( cpu, pf );
if( cpu&X264_CPU_MMXEXT )
+ {
pf->mc_chroma = x264_mc_chroma_mmxext;
+ }
#endif
#ifdef ARCH_PPC
if( cpu&X264_CPU_ALTIVEC )
@@ -442,42 +491,3 @@
}
}
}
-
-void x264_frame_init_lowres( x264_t *h, x264_frame_t *frame )
-{
- // FIXME: tapfilter?
- const int i_stride = frame->i_stride[0];
- const int i_stride2 = frame->i_stride_lowres;
- const int i_width2 = frame->i_width_lowres;
- int x, y, i;
- for( y = 0; y < frame->i_lines_lowres - 1; y++ )
- {
- uint8_t *src0 = &frame->plane[0][2*y*i_stride];
- uint8_t *src1 = src0+i_stride;
- uint8_t *src2 = src1+i_stride;
- uint8_t *dst0 = &frame->lowres[0][y*i_stride2];
- uint8_t *dsth = &frame->lowres[1][y*i_stride2];
- uint8_t *dstv = &frame->lowres[2][y*i_stride2];
- uint8_t *dstc = &frame->lowres[3][y*i_stride2];
- for( x = 0; x < i_width2 - 1; x++ )
- {
-            dst0[x] = (src0[2*x  ] + src0[2*x+1] + src1[2*x  ] + src1[2*x+1] + 2) >> 2;
-            dsth[x] = (src0[2*x+1] + src0[2*x+2] + src1[2*x+1] + src1[2*x+2] + 2) >> 2;
-            dstv[x] = (src1[2*x  ] + src1[2*x+1] + src2[2*x  ] + src2[2*x+1] + 2) >> 2;
-            dstc[x] = (src1[2*x+1] + src1[2*x+2] + src2[2*x+1] + src2[2*x+2] + 2) >> 2;
-        }
-        dst0[x] = (src0[2*x  ] + src0[2*x+1] + src1[2*x  ] + src1[2*x+1] + 2) >> 2;
-        dstv[x] = (src1[2*x  ] + src1[2*x+1] + src2[2*x  ] + src2[2*x+1] + 2) >> 2;
-        dsth[x] = (src0[2*x+1] + src1[2*x+1] + 1) >> 1;
-        dstc[x] = (src1[2*x+1] + src2[2*x+1] + 1) >> 1;
-    }
-    for( i = 0; i < 4; i++ )
-        memcpy( &frame->lowres[i][y*i_stride2], &frame->lowres[i][(y-1)*i_stride2], i_width2 );
-
- for( y = 0; y < 16; y++ )
- for( x = 0; x < 16; x++ )
- frame->i_cost_est[x][y] = -1;
-
- x264_frame_expand_border_lowres( frame );
-}
-
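A note on the new reference downsampler above: it replaces the old single-rounding four-pixel average with two pavgb-style averages, each rounding by (a+b+1)>>1, so the C path is bit-exact with the SIMD code at the cost of a little speed. A minimal standalone sketch (not part of the patch; names are illustrative) showing that the two schemes can differ in the low bit:

    #include <stdio.h>
    #include <stdint.h>

    /* old reference: one rounding over four pixels */
    static uint8_t avg4_once( int a, int b, int c, int d )
    {
        return (a + b + c + d + 2) >> 2;
    }
    /* what pavgb computes for one byte pair */
    static uint8_t pavg( int a, int b )
    {
        return (a + b + 1) >> 1;
    }
    /* new reference: two roundings, matching the assembly */
    static uint8_t avg4_twice( int a, int b, int c, int d )
    {
        return pavg( pavg( a, b ), pavg( c, d ) );
    }

    int main( void )
    {
        /* prints "0 1": (1,0,0,0) rounds down once but up twice */
        printf( "%d %d\n", avg4_once( 1, 0, 0, 0 ), avg4_twice( 1, 0, 0, 0 ) );
        return 0;
    }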
Index: common/mc.h
===================================================================
--- common/mc.h (revision 745)
+++ common/mc.h (working copy)
@@ -65,7 +65,9 @@
uint8_t *pix_uv, int stride_uv, int mb_x );
/* prefetch the next few macroblocks of a hpel reference frame */
void (*prefetch_ref)( uint8_t *pix, int stride, int parity );
-
+
+    /* Initialize the four half-resolution (lowres) planes of a frame */
+    void (*frame_init_lowres_core)( int src_stride, int dest_stride, int height, int width, uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc );
} x264_mc_functions_t;
void x264_mc_init( int cpu, x264_mc_functions_t *pf );
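Callers go through the new function pointer, so the CPU-specific version is picked once at init time; roughly (a sketch, assuming the declarations above):

    x264_mc_functions_t mc;
    x264_mc_init( cpu_flags, &mc );  /* leaves the C core, or an MMX/SSE2/SSSE3 one */
    mc.frame_init_lowres_core( src_stride, dst_stride, height, width,
                               src0, dst0, dsth, dstv, dstc );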
Index: common/i386/mc-a.asm
===================================================================
--- common/i386/mc-a.asm (revision 745)
+++ common/i386/mc-a.asm (working copy)
@@ -128,21 +128,23 @@
movdqa [eax+ebx], xmm1
AVG_END
-%macro AVGH 2
-cglobal x264_pixel_avg_%1x%2_mmxext
+%macro AVGH 3
+cglobal x264_pixel_avg_%1x%2_%3
push esi
mov esi, %2
- jmp x264_pixel_avg_w%1_mmxext
+ jmp x264_pixel_avg_w%1_%3
%endmacro
-AVGH 16, 16
-AVGH 16, 8
-AVGH 8, 16
-AVGH 8, 8
-AVGH 8, 4
-AVGH 4, 8
-AVGH 4, 4
-AVGH 4, 2
+AVGH 16, 16, mmxext
+AVGH 16, 8, mmxext
+AVGH 8, 16, mmxext
+AVGH 8, 8, mmxext
+AVGH 8, 4, mmxext
+AVGH 4, 8, mmxext
+AVGH 4, 4, mmxext
+AVGH 4, 2, mmxext
+AVGH 16, 16, sse2
+AVGH 16, 8, sse2
%macro AVG2_START 1
cglobal %1
@@ -191,6 +193,21 @@
movq [eax+ebx], mm1
AVG2_END
+AVG2_START x264_pixel_avg2_w12_mmxext
+ movq mm0, [ecx]
+ movd mm1, [ecx+8]
+ movq mm2, [ecx+edx]
+ movd mm3, [ecx+edx+8]
+ pavgb mm0, [ecx+edi]
+ pavgb mm1, [ecx+edi+8]
+ pavgb mm2, [ecx+ebp]
+ pavgb mm3, [ecx+ebp+8]
+ movq [eax], mm0
+ movd [eax+8], mm1
+ movq [eax+ebx], mm2
+ movd [eax+ebx+8], mm3
+AVG2_END
+
AVG2_START x264_pixel_avg2_w16_mmxext
movq mm0, [ecx]
movq mm1, [ecx+8]
@@ -227,7 +244,33 @@
movd [eax+ebx+16], mm5
AVG2_END
+AVG2_START x264_pixel_avg2_w16_sse2
+ movdqu xmm0, [ecx]
+ movdqu xmm2, [ecx+edi]
+ movdqu xmm1, [ecx+edx]
+ movdqu xmm3, [ecx+ebp]
+ pavgb xmm0, xmm2
+ pavgb xmm1, xmm3
+ movdqa [eax], xmm0
+ movdqa [eax+ebx], xmm1
+AVG2_END
+AVG2_START x264_pixel_avg2_w20_sse2
+ movdqu xmm0, [ecx]
+ movdqu xmm2, [ecx+edi]
+ movdqu xmm1, [ecx+edx]
+ movdqu xmm3, [ecx+ebp]
+ movd mm2, [ecx+16]
+ movd mm5, [ecx+edx+16]
+ pavgb xmm0, xmm2
+ pavgb xmm1, xmm3
+ pavgb mm2, [ecx+edi+16]
+ pavgb mm5, [ecx+ebp+16]
+ movdqa [eax], xmm0
+ movd [eax+16], mm2
+ movdqa [eax+ebx], xmm1
+ movd [eax+ebx+16], mm5
+AVG2_END
;=============================================================================
; weighted prediction
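The 12-wide avg2 above fills the missing slot in the width tables, so mc_luma/get_ref can index them directly with i_width>>2 (widths 4/8/12/16/20 map to slots 1..5). Behaviorally it should match this C sketch (illustrative name; src2 shares src1's stride, as with the other avg2 routines):

    static void pixel_avg2_w12_ref( uint8_t *dst,  int i_dst_stride,
                                    uint8_t *src1, int i_src_stride,
                                    uint8_t *src2, int i_height )
    {
        int x, y;
        for( y = 0; y < i_height; y++ )
        {
            for( x = 0; x < 12; x++ )
                dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;  /* pavgb rounding */
            dst  += i_dst_stride;
            src1 += i_src_stride;
            src2 += i_src_stride;
        }
    }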
Index: common/i386/mc-c.c
===================================================================
--- common/i386/mc-c.c (revision 745)
+++ common/i386/mc-c.c (working copy)
@@ -28,6 +28,8 @@
#include "common/common.h"
/* NASM functions */
+extern void x264_pixel_avg_16x16_sse2( uint8_t *, int, uint8_t *, int );
+extern void x264_pixel_avg_16x8_sse2( uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_16x16_mmxext( uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_16x8_mmxext( uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_8x16_mmxext( uint8_t *, int, uint8_t *, int );
@@ -38,8 +40,11 @@
extern void x264_pixel_avg_4x2_mmxext( uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg2_w4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg2_w8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+extern void x264_pixel_avg2_w12_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg2_w16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg2_w20_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+extern void x264_pixel_avg2_w16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+extern void x264_pixel_avg2_w20_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int );
extern void x264_pixel_avg_weight_w16_mmxext( uint8_t *, int, uint8_t *, int, int, int );
@@ -52,7 +57,16 @@
extern void x264_prefetch_ref_mmxext( uint8_t *, int, int );
extern void x264_hpel_filter_mmxext( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
                                     int i_stride, int i_width, int i_height );
+extern void frame_init_lowres_core_ssse3_w64( int src_stride, int dest_stride, int height, int width, uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc );
+extern void frame_init_lowres_core_ssse3_w32( int src_stride, int dest_stride, int height, int width, uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc );
+extern void frame_init_lowres_core_ssse3_w16( int src_stride, int dest_stride, int height, int width, uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc );
+extern void frame_init_lowres_core_sse2_w64( int src_stride, int dest_stride, int height, int width, uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc );
+extern void frame_init_lowres_core_sse2_w32( int src_stride, int dest_stride, int height, int width, uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc );
+extern void frame_init_lowres_core_sse2_w16( int src_stride, int dest_stride, int height, int width, uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc );
+extern void frame_init_lowres_core_mmx_w32( int src_stride, int dest_stride, int height, int width, uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc );
+extern void frame_init_lowres_core_mmx_w16( int src_stride, int dest_stride, int height, int width, uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc );
+
#define AVG_WEIGHT(W,H) \
void x264_pixel_avg_weight_ ## W ## x ## H ## _mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_weight_dst ) \
{ \
@@ -69,9 +83,9 @@
NULL,
x264_pixel_avg2_w4_mmxext,
x264_pixel_avg2_w8_mmxext,
+ x264_pixel_avg2_w12_mmxext,
x264_pixel_avg2_w16_mmxext,
- x264_pixel_avg2_w16_mmxext,
- x264_pixel_avg2_w20_mmxext,
+ x264_pixel_avg2_w20_mmxext
};
static void (* const x264_mc_copy_wtab_mmx[5])( uint8_t *, int, uint8_t *, int, int ) =
{
@@ -81,57 +95,122 @@
NULL,
x264_mc_copy_w16_mmx
};
+static void (* const x264_pixel_avg_wtab_sse2[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =
+{
+ NULL,
+ x264_pixel_avg2_w4_mmxext,
+ x264_pixel_avg2_w8_mmxext,
+ x264_pixel_avg2_w12_mmxext,
+ x264_pixel_avg2_w16_sse2,
+ x264_pixel_avg2_w20_sse2
+};
+static void (* const x264_mc_copy_wtab_sse2[5])( uint8_t *, int, uint8_t *, int, int ) =
+{
+ NULL,
+ x264_mc_copy_w4_mmx,
+ x264_mc_copy_w8_mmx,
+ NULL,
+ x264_mc_copy_w16_sse2
+};
static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
-void mc_luma_mmxext( uint8_t *dst, int i_dst_stride,
- uint8_t *src[4], int i_src_stride,
- int mvx, int mvy,
- int i_width, int i_height )
-{
- int qpel_idx = ((mvy&3)<<2) + (mvx&3);
- int offset = (mvy>>2)*i_src_stride + (mvx>>2);
-    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
+#define MC_LUMA(name,instr1,instr2)\
+void mc_luma_##name( uint8_t *dst, int i_dst_stride,\
+ uint8_t *src[4], int i_src_stride,\
+ int mvx, int mvy,\
+ int i_width, int i_height )\
+{\
+ int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
+ int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
+    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\
+ if( qpel_idx & 5 ) /* qpel interpolation needed */\
+ {\
+        uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\
+ x264_pixel_avg_wtab_##instr1[i_width>>2](\
+ dst, i_dst_stride, src1, i_src_stride,\
+ src2, i_height );\
+ }\
+ else\
+ {\
+ x264_mc_copy_wtab_##instr2[i_width>>2](\
+ dst, i_dst_stride, src1, i_src_stride, i_height );\
+ }\
+}
- if( qpel_idx & 5 ) /* qpel interpolation needed */
- {
- uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
- x264_pixel_avg_wtab_mmxext[i_width>>2](
- dst, i_dst_stride, src1, i_src_stride,
- src2, i_height );
- }
- else
- {
- x264_mc_copy_wtab_mmx[i_width>>2](
- dst, i_dst_stride, src1, i_src_stride, i_height );
- }
+MC_LUMA(mmxext,mmxext,mmx)
+MC_LUMA(sse2,sse2,sse2)
+#include "bench.h"
+#define GET_REF(name)\
+uint8_t *get_ref_##name( uint8_t *dst, int *i_dst_stride,\
+ uint8_t *src[4], int i_src_stride,\
+ int mvx, int mvy,\
+ int i_width, int i_height )\
+{\
+ int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
+ int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
+    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\
+ if( qpel_idx & 5 ) /* qpel interpolation needed */\
+ {\
+        uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\
+ x264_pixel_avg_wtab_##name[i_width>>2](\
+ dst, *i_dst_stride, src1, i_src_stride,\
+ src2, i_height );\
+ return dst;\
+ }\
+ else\
+ {\
+ *i_dst_stride = i_src_stride;\
+ return src1;\
+ }\
}
-uint8_t *get_ref_mmxext( uint8_t *dst, int *i_dst_stride,
- uint8_t *src[4], int i_src_stride,
- int mvx, int mvy,
- int i_width, int i_height )
+GET_REF(mmxext)
+GET_REF(sse2)
+
+void frame_init_lowres_core_mmx( int src_stride, int dest_stride, int height, int width, uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc )
{
- int qpel_idx = ((mvy&3)<<2) + (mvx&3);
- int offset = (mvy>>2)*i_src_stride + (mvx>>2);
-    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
-
- if( qpel_idx & 5 ) /* qpel interpolation needed */
+ int x;
+    width = width >> 3; /* one unit = 8 dst (16 src) pixels, matching the w32/w16 kernels */
+ for( x = 0; width - x >= 2; x++ )
{
- uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
- x264_pixel_avg_wtab_mmxext[i_width>>2](
- dst, *i_dst_stride, src1, i_src_stride,
- src2, i_height );
- return dst;
+        frame_init_lowres_core_mmx_w32( src_stride, dest_stride, height, width, src0, dst0, dsth, dstv, dstc );
+ src0 += 32;
+ dst0 += 16;
+ dsth += 16;
+ dstv += 16;
+ dstc += 16;
+ x++;
}
- else
+ if(width - x == 1)
{
- *i_dst_stride = i_src_stride;
- return src1;
+        frame_init_lowres_core_mmx_w16( src_stride, dest_stride, height, width, src0, dst0, dsth, dstv, dstc );
}
}
+#define FRAME_INIT_SSE(name)\
+void frame_init_lowres_core_##name( int src_stride, int dest_stride, int height, int width, uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc )\
+{\
+    int x; width = width >> 3;\
+    for( x = 0; width - x >= 4; x++ )\
+    {\
+        frame_init_lowres_core_##name##_w64( src_stride, dest_stride, height, width, src0, dst0, dsth, dstv, dstc );\
+        src0 += 64; dst0 += 32; dsth += 32; dstv += 32; dstc += 32; x+=3;\
+    }\
+    /* continue from where the w64 loop left off */\
+    for( ; width - x >= 2; x++ )\
+    {\
+        frame_init_lowres_core_##name##_w32( src_stride, dest_stride, height, width, src0, dst0, dsth, dstv, dstc );\
+        src0 += 32; dst0 += 16; dsth += 16; dstv += 16; dstc += 16; x++;\
+    }\
+    if(width - x == 1)\
+    {\
+        frame_init_lowres_core_##name##_w16( src_stride, dest_stride, height, width, src0, dst0, dsth, dstv, dstc );\
+    }\
+}
+FRAME_INIT_SSE(sse2)
+FRAME_INIT_SSE(ssse3)
+
void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
{
if( !(cpu&X264_CPU_MMX) )
@@ -169,6 +248,19 @@
pf->prefetch_fenc = x264_prefetch_fenc_mmxext;
pf->prefetch_ref = x264_prefetch_ref_mmxext;
-
- /* todo: use sse2 */
+    // disable the SSE2 paths on 3DNow!-capable (AMD) processors, where they are slower
+ if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_3DNOW) )
+ {
+ pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
+ pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2;
+ pf->mc_luma = mc_luma_sse2;
+ pf->get_ref = get_ref_sse2;
+ pf->frame_init_lowres_core = frame_init_lowres_core_sse2;
+ }
+#ifdef HAVE_SSE3
+ if( cpu&X264_CPU_SSSE3 )
+ {
+ pf->frame_init_lowres_core = frame_init_lowres_core_ssse3;
+ }
+#endif //HAVE_SSE3
}
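FRAME_INIT_SSE above carves each row into the widest available kernels: as many 64-pixel source chunks as fit, then 32-pixel chunks, then at most one 16-pixel chunk; width is pre-shifted so one unit is 16 source (8 destination) pixels. The chunk sequence it produces can be modeled as (illustrative only):

    #include <stdio.h>

    static void lowres_kernel_plan( int dst_width )
    {
        int units = dst_width >> 3;  /* 8 dst pixels = 16 src pixels per unit */
        while( units >= 4 ) { puts( "w64" ); units -= 4; }
        while( units >= 2 ) { puts( "w32" ); units -= 2; }
        if( units == 1 )      puts( "w16" );
    }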
Index: common/i386/dct-a.asm
===================================================================
--- common/i386/dct-a.asm (revision 745)
+++ common/i386/dct-a.asm (working copy)
@@ -711,7 +711,7 @@
MMX_STORE_DIFF_8P xmm0, [ecx+FDEC_STRIDE*6], xmm6, xmm7
MMX_STORE_DIFF_8P xmm1, [ecx+FDEC_STRIDE*7], xmm6, xmm7
ret
-
+
;-----------------------------------------------------------------------------
; void __cdecl x264_sub8x8_dct_mmx( int16_t dct[4][4][4],
; uint8_t *pix1, uint8_t *pix2 )
Index: common/i386/mc-a2.asm
===================================================================
--- common/i386/mc-a2.asm (revision 745)
+++ common/i386/mc-a2.asm (working copy)
@@ -33,6 +33,7 @@
SECTION_RODATA
ALIGN 16
+pw_255: times 8 dw 255
pw_1: times 4 dw 1
pw_16: times 4 dw 16
pw_32: times 4 dw 32
@@ -324,4 +325,266 @@
pop edi
emms
ret
+
+%macro FILTER_START 2
+ picgetgot eax
+ mov%2 %1, [pw_255 GLOBAL]
+ push ebx
+ push edi
+ push esi
+ push ebp
+ mov ebp, [esp+4+16 ] ;src_stride
+ mov eax, [esp+20+16] ;source
+ mov ebx, [esp+24+16] ;dest0
+ mov ecx, [esp+28+16] ;desth
+ mov edx, [esp+32+16] ;destv
+ mov edi, [esp+36+16] ;destc
+ mov esi, [esp+8+16 ] ;dest_stride
+%endmacro
+%macro FILTER_END 0
+ pop ebp
+ pop esi
+ pop edi
+ pop ebx
+ ret
+%endmacro
+
+%macro FILTER_PREFETCH 1
+ prefetch [eax+ebp*%1]
+ prefetch [ebx+esi*%1]
+ prefetch [ecx+esi*%1]
+ prefetch [edx+esi*%1]
+ prefetch [edi+esi*%1]
+%endmacro
+
+%macro INIT_LOAD 3
+ mov%3 %1, [eax+1+%2]
+ pavgb %1, [eax+%2]
+%endmacro
+
+%macro WIDTH_FILTER 11
+ mov%8 %1, [eax+1+ebp+%6]
+ pavgb %1, [eax+ebp+%6]
+ mov%9 %2, %1
+ pavgb %1, %3
+ mov%9 %3, %2
+ pand %1, %11
+ psrlw %2, 8
+ packuswb %1, %1
+ packuswb %2, %2
+ mov%10 [%4+%7], %1
+ mov%10 [%5+%7], %2
+%endmacro
+
+%macro WIDTH16_FILTER_SSSE3 7
+ movdqa %2, [eax+ebp+%6]
+ movdqa xmm6, [eax+16+ebp+%6]
+ movdqa %1, %2
+ palignr %2, xmm6, 1
+ movdqa %2, %1
+ pavgb %1, %3
+ movdqa %3, %2
+ pand %1, xmm7
+ psrlw %2, 8
+ packuswb %1, %1
+ packuswb %2, %2
+ movq [%4+%7], %1
+ movq [%5+%7], %2
+%endmacro
+
+%macro WIDTH8_FILTER_MMX 7
+ WIDTH_FILTER %1, %2, %3, %4, %5, %6, %7, q, q, d, mm7
+%endmacro
+
+%macro WIDTH16_FILTER_SSE2 7
+ WIDTH_FILTER %1, %2, %3, %4, %5, %6, %7, dqu, dqa, q, xmm7
+%endmacro
+
+%macro LOWRES_FILTER_STEP_W64_SSE 3
+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm2, %1, %2, 0, 0
+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm3, %1, %2, 16, 8
+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm4, %1, %2, 32, 16
+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm5, %1, %2, 48, 24
+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm2, %1, %2, 0, 0
+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm3, %1, %2, 16, 8
+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm4, %1, %2, 32, 16
+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm5, %1, %2, 48, 24
+ add eax, ebp
+ add %1, esi
+ add %2, esi
+%endmacro
+
+%macro LOWRES_FILTER_STEP_W32_SSE 3
+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm2, %1, %2, 0, 0
+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm3, %1, %2, 16, 8
+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm2, %1, %2, 0, 0
+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm3, %1, %2, 16, 8
+ add eax, ebp
+ add %1, esi
+ add %2, esi
+%endmacro
+
+%macro LOWRES_FILTER_STEP_W16_SSE 3
+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm2, %1, %2, 0, 0
+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm2, %1, %2, 0, 0
+ add eax, ebp
+ add %1, esi
+ add %2, esi
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void frame_init_lowres_core_sse2_w64( int src_stride, int dest_stride, int height, int width,
+;                                       uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc )
+;-----------------------------------------------------------------------------
+cglobal frame_init_lowres_core_sse2_w64
+ FILTER_START xmm7, dqa
+ INIT_LOAD xmm2, 0, dqu
+ INIT_LOAD xmm3, 16, dqu
+ INIT_LOAD xmm4, 32, dqu
+ INIT_LOAD xmm5, 48, dqu
+ .vloop:
+ FILTER_PREFETCH 2
+ LOWRES_FILTER_STEP_W64_SSE ebx, ecx, SSE2
+ LOWRES_FILTER_STEP_W64_SSE edx, edi, SSE2
+ dec dword [esp+12+16]
+ jg .vloop
+ FILTER_END
+
+;-----------------------------------------------------------------------------
+; void frame_init_lowres_core_sse2_w32( int src_stride, int dest_stride, int height, int width,
+;                                       uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc )
+;-----------------------------------------------------------------------------
+cglobal frame_init_lowres_core_sse2_w32
+ FILTER_START xmm7, dqa
+ INIT_LOAD xmm2, 0, dqu
+ INIT_LOAD xmm3, 16, dqu
+ .vloop:
+ FILTER_PREFETCH 2
+ LOWRES_FILTER_STEP_W32_SSE ebx, ecx, SSE2
+ LOWRES_FILTER_STEP_W32_SSE edx, edi, SSE2
+ dec dword [esp+12+16]
+ jg .vloop
+ FILTER_END
+
+;-----------------------------------------------------------------------------
+; void frame_init_lowres_core_sse2_w16( int src_stride, int dest_stride, int height, int width,
+;                                       uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc )
+;-----------------------------------------------------------------------------
+cglobal frame_init_lowres_core_sse2_w16
+ FILTER_START xmm7, dqa
+ INIT_LOAD xmm2, 0, dqu
+ .vloop:
+ FILTER_PREFETCH 4
+ LOWRES_FILTER_STEP_W16_SSE ebx, ecx, SSE2
+ LOWRES_FILTER_STEP_W16_SSE edx, edi, SSE2
+ dec dword [esp+12+16]
+ jg .vloop
+ FILTER_END
+
+%macro LOWRES_FILTER_STEP_W32_MMX 2
+ WIDTH8_FILTER_MMX mm0, mm1, mm2, %1, %2, 0, 0
+ WIDTH8_FILTER_MMX mm0, mm1, mm3, %1, %2, 8, 4
+ WIDTH8_FILTER_MMX mm0, mm1, mm4, %1, %2, 16, 8
+ WIDTH8_FILTER_MMX mm0, mm1, mm5, %1, %2, 24, 12
+ WIDTH8_FILTER_MMX mm0, mm1, mm2, %1, %2, 0, 0
+ WIDTH8_FILTER_MMX mm0, mm1, mm3, %1, %2, 8, 4
+ WIDTH8_FILTER_MMX mm0, mm1, mm4, %1, %2, 16, 8
+ WIDTH8_FILTER_MMX mm0, mm1, mm5, %1, %2, 24, 12
+ add eax, ebp
+ add %1, esi
+ add %2, esi
+%endmacro
+
+%macro LOWRES_FILTER_STEP_W16_MMX 2
+ WIDTH8_FILTER_MMX mm0, mm1, mm2, %1, %2, 0, 0
+ WIDTH8_FILTER_MMX mm0, mm1, mm3, %1, %2, 8, 4
+ WIDTH8_FILTER_MMX mm0, mm1, mm2, %1, %2, 0, 0
+ WIDTH8_FILTER_MMX mm0, mm1, mm3, %1, %2, 8, 4
+ add eax, ebp
+ add %1, esi
+ add %2, esi
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void frame_init_lowres_core_mmx_w32( int src_stride, int dest_stride, int height, int width,
+;                                      uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc )
+;-----------------------------------------------------------------------------
+cglobal frame_init_lowres_core_mmx_w32
+ FILTER_START mm7, q
+ INIT_LOAD mm2, 0, q
+ INIT_LOAD mm3, 8, q
+ INIT_LOAD mm4, 16, q
+ INIT_LOAD mm5, 24, q
+ .vloop:
+ FILTER_PREFETCH 2
+ LOWRES_FILTER_STEP_W32_MMX ebx, ecx
+ LOWRES_FILTER_STEP_W32_MMX edx, edi
+ dec dword [esp+12+16]
+ jg .vloop
+ FILTER_END
+
+;-----------------------------------------------------------------------------
+; void frame_init_lowres_core_mmx_w16( int src_stride, int dest_stride, int height, int width,
+;                                      uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc )
+;-----------------------------------------------------------------------------
+cglobal frame_init_lowres_core_mmx_w16
+ FILTER_START mm7, q
+ INIT_LOAD mm2, 0, q
+ INIT_LOAD mm3, 8, q
+ .vloop:
+ FILTER_PREFETCH 2
+ LOWRES_FILTER_STEP_W16_MMX ebx, ecx
+ LOWRES_FILTER_STEP_W16_MMX edx, edi
+ dec dword [esp+12+16]
+ jg .vloop
+ FILTER_END
+
+;-----------------------------------------------------------------------------
+; void frame_init_lowres_core_ssse3_w64( int src_stride, int dest_stride, int height, int width,
+;                                        uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc )
+;-----------------------------------------------------------------------------
+cglobal frame_init_lowres_core_ssse3_w64
+ FILTER_START xmm7, dqa
+ INIT_LOAD xmm2, 0, dqu
+ INIT_LOAD xmm3, 16, dqu
+ INIT_LOAD xmm4, 32, dqu
+ INIT_LOAD xmm5, 48, dqu
+ .vloop:
+ FILTER_PREFETCH 2
+ LOWRES_FILTER_STEP_W64_SSE ebx, ecx, SSSE3
+ LOWRES_FILTER_STEP_W64_SSE edx, edi, SSSE3
+ dec dword [esp+12+16]
+ jg .vloop
+ FILTER_END
+
+;-----------------------------------------------------------------------------
+; void frame_init_lowres_core_ssse3_w32( int src_stride, int dest_stride, int height, int width,
+;                                        uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc )
+;-----------------------------------------------------------------------------
+cglobal frame_init_lowres_core_ssse3_w32
+ FILTER_START xmm7, dqa
+ INIT_LOAD xmm2, 0, dqu
+ INIT_LOAD xmm3, 16, dqu
+ .vloop:
+ FILTER_PREFETCH 2
+ LOWRES_FILTER_STEP_W32_SSE ebx, ecx, SSSE3
+ LOWRES_FILTER_STEP_W32_SSE edx, edi, SSSE3
+ dec dword [esp+12+16]
+ jg .vloop
+ FILTER_END
+
+;-----------------------------------------------------------------------------
+; void frame_init_lowres_core_ssse3_w16( int src_stride, int dest_stride, int height, int width,
+;                                        uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc )
+;-----------------------------------------------------------------------------
+cglobal frame_init_lowres_core_ssse3_w16
+ FILTER_START xmm7, dqa
+ INIT_LOAD xmm2, 0, dqu
+ .vloop:
+ FILTER_PREFETCH 4
+ LOWRES_FILTER_STEP_W16_SSE ebx, ecx, SSSE3
+ LOWRES_FILTER_STEP_W16_SSE edx, edi, SSSE3
+ dec dword [esp+12+16]
+ jg .vloop
+ FILTER_END
\ No newline at end of file
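In the filter kernels above, the averaged bytes for two adjacent lowres columns come out interleaved in one register; the kernels separate them with pand against pw_255 (even bytes, the dst0/dstv columns) and psrlw by 8 (odd bytes, the dsth/dstc columns), then repack each half to bytes with packuswb. A scalar model of that split (names are ours):

    #include <stdint.h>

    static void split_even_odd( const uint16_t *packed, int n,
                                uint8_t *even, uint8_t *odd )
    {
        int i;
        for( i = 0; i < n; i++ )
        {
            even[i] = packed[i] & 0xFF;  /* pand  reg, [pw_255] */
            odd[i]  = packed[i] >> 8;    /* psrlw reg, 8        */
        }
    }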
Index: tools/checkasm.c
===================================================================
--- tools/checkasm.c (revision 745)
+++ tools/checkasm.c (working copy)
@@ -407,8 +407,8 @@
uint8_t *src = &buf1[2*32+2];
uint8_t *src2[4] = { &buf1[2*32+2], &buf1[6*32+2],
&buf1[10*32+2], &buf1[14*32+2] };
- uint8_t *dst1 = &buf3[2*32+2];
- uint8_t *dst2 = &buf4[2*32+2];
+ uint8_t *dst1 = &buf3[2*32];
+ uint8_t *dst2 = &buf4[2*32];
int dx, dy, i, j, w;
int ret = 0, ok, used_asm;
@@ -519,7 +519,43 @@
for( w = -64; w <= 128 && ok; w++ )
MC_TEST_AVG( avg_weight, w );
report( "mc wpredb :" );
-
+
+ DECLARE_ALIGNED( uint8_t, src1[64*64], 16 );
+ DECLARE_ALIGNED( uint8_t, dst1a[64*64], 16 );
+ DECLARE_ALIGNED( uint8_t, dst2a[64*64], 16 );
+ DECLARE_ALIGNED( uint8_t, dst3a[64*64], 16 );
+ DECLARE_ALIGNED( uint8_t, dst4a[64*64], 16 );
+ DECLARE_ALIGNED( uint8_t, dst1b[64*64], 16 );
+ DECLARE_ALIGNED( uint8_t, dst2b[64*64], 16 );
+ DECLARE_ALIGNED( uint8_t, dst3b[64*64], 16 );
+ DECLARE_ALIGNED( uint8_t, dst4b[64*64], 16 );
+#define MC_TEST_LOWRES(w,h) \
+    if( mc_a.frame_init_lowres_core != mc_ref.frame_init_lowres_core ) \
+    { \
+        used_asm = 1; \
+        memset(src1, 0xCD, w*h); \
+        mc_c.frame_init_lowres_core( w, w/2, h/2, w/2, src1, dst1a, dst2a, dst3a, dst4a );\
+        mc_a.frame_init_lowres_core( w, w/2, h/2, w/2, src1, dst1b, dst2b, dst3b, dst4b );\
+        if( memcmp( dst1a, dst1b, w*h/4 ) || memcmp( dst2a, dst2b, w*h/4 )\
+         || memcmp( dst3a, dst3b, w*h/4 ) || memcmp( dst4a, dst4b, w*h/4 )) \
+        { \
+            ok = 0; \
+        } \
+    }
+ for( j = 0; j < 1000; j++)
+ {
+ for( i = 0; i < 64*64; i++ )
+ {
+ src1[i] = rand() & 0xFF;
+ }
+ MC_TEST_LOWRES(16,16);
+ MC_TEST_LOWRES(32,48);
+ MC_TEST_LOWRES(48,16);
+ MC_TEST_LOWRES(16,32);
+ MC_TEST_LOWRES(32,16);
+ MC_TEST_LOWRES(32,32);
+ }
+ report( "frame_init_lowres:" );
return ret;
}