Index: common/mc.c<br>===================================================================<br>--- common/mc.c (revision 745)<br>+++ common/mc.c (working copy)<br>@@ -336,6 +336,52 @@<br> void prefetch_ref_null( uint8_t *pix, int stride, int parity )<br>
{}<br> <br>+void x264_frame_init_lowres( x264_t *h, x264_frame_t *frame )<br>+{<br>+ const int i_stride = frame->i_stride[0];<br>+ const int i_stride2 = frame->i_stride_lowres;<br>+ const int i_width2 = frame->i_width_lowres;<br>
+ int x, y;<br>+ uint8_t *src0 = frame->plane[0];<br>+ uint8_t *dst0 = frame->lowres[0];<br>+ uint8_t *dsth = frame->lowres[1];<br>+ uint8_t *dstv = frame->lowres[2];<br>+ uint8_t *dstc = frame->lowres[3];<br>
+ /* Duplicate last column and row of pixels. */<br>+ for(y=0; y<frame->i_lines[0]; y++) src0[frame->i_width[0]+y*i_stride] = src0[frame->i_width[0]-1+y*i_stride];<br>+ for(y=0; y<frame->i_width[0]; y++) src0[y+i_stride*frame->i_lines[0]] = src0[y+i_stride*(frame->i_lines[0]-1)];<br>
+ h->mc.frame_init_lowres_core(i_stride, i_stride2, frame->i_lines_lowres, i_width2, src0, dst0, dsth, dstv, dstc );<br>+<br>+ for( y = 0; y < 16; y++ )<br>+ for( x = 0; x < 16; x++ )<br>+ frame->i_cost_est[x][y] = -1;<br>
+<br>+ x264_frame_expand_border_lowres( frame );<br>+}<br>+<br>+void frame_init_lowres_core(int src_stride, int dest_stride, int height, int width, uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc)<br>
+{<br>+ int x,y;<br>+ for( y = 0; y < height; y++ )<br>+ {<br>+ uint8_t *src1 = src0+src_stride;<br>+ uint8_t *src2 = src1+src_stride;<br>+ for( x = 0; x < width; x++ )<br>+ {<br>
+ //Slower in order to match assembly output.<br>+ dst0[x] = (((src0[2*x ] + src0[2*x+1] + 1) >> 1) + ((src1[2*x ] + src1[2*x+1] + 1) >> 1) + 1) >> 1;<br>+ dsth[x] = (((src0[2*x+1] + src0[2*x+2] + 1) >> 1) + ((src1[2*x+1] + src1[2*x+2] + 1) >> 1) + 1) >> 1;<br>
+ dstv[x] = (((src1[2*x ] + src1[2*x+1] + 1) >> 1) + ((src2[2*x ] + src2[2*x+1] + 1) >> 1) + 1) >> 1;<br>+ dstc[x] = (((src1[2*x+1] + src1[2*x+2] + 1) >> 1) + ((src2[2*x+1] + src2[2*x+2] + 1) >> 1) + 1) >> 1;<br>
+ }<br>+ src0 += src_stride*2;<br>+ dst0 += dest_stride;<br>+ dsth += dest_stride;<br>+ dstv += dest_stride;<br>+ dstc += dest_stride;<br>+ }<br>+}<br>+<br> void x264_mc_init( int cpu, x264_mc_functions_t *pf )<br>
{<br> pf->mc_luma = mc_luma;<br>@@ -373,11 +419,14 @@<br> <br> pf->prefetch_fenc = prefetch_fenc_null;<br> pf->prefetch_ref = prefetch_ref_null;<br>+ pf->frame_init_lowres_core = frame_init_lowres_core;<br>
<br> #ifdef HAVE_MMX<br> x264_mc_init_mmx( cpu, pf );<br> if( cpu&X264_CPU_MMXEXT )<br>+ {<br> pf->mc_chroma = x264_mc_chroma_mmxext;<br>+ }<br> #endif<br> #ifdef ARCH_PPC<br> if( cpu&X264_CPU_ALTIVEC )<br>
@@ -442,42 +491,3 @@<br> }<br> }<br> }<br>-<br>-void x264_frame_init_lowres( x264_t *h, x264_frame_t *frame )<br>-{<br>- // FIXME: tapfilter?<br>- const int i_stride = frame->i_stride[0];<br>- const int i_stride2 = frame->i_stride_lowres;<br>
- const int i_width2 = frame->i_width_lowres;<br>- int x, y, i;<br>- for( y = 0; y < frame->i_lines_lowres - 1; y++ )<br>- {<br>- uint8_t *src0 = &frame->plane[0][2*y*i_stride];<br>- uint8_t *src1 = src0+i_stride;<br>
- uint8_t *src2 = src1+i_stride;<br>- uint8_t *dst0 = &frame->lowres[0][y*i_stride2];<br>- uint8_t *dsth = &frame->lowres[1][y*i_stride2];<br>- uint8_t *dstv = &frame->lowres[2][y*i_stride2];<br>
- uint8_t *dstc = &frame->lowres[3][y*i_stride2];<br>- for( x = 0; x < i_width2 - 1; x++ )<br>- {<br>- dst0[x] = (src0[2*x ] + src0[2*x+1] + src1[2*x ] + src1[2*x+1] + 2) >> 2;<br>
- dsth[x] = (src0[2*x+1] + src0[2*x+2] + src1[2*x+1] + src1[2*x+2] + 2) >> 2;<br>- dstv[x] = (src1[2*x ] + src1[2*x+1] + src2[2*x ] + src2[2*x+1] + 2) >> 2;<br>- dstc[x] = (src1[2*x+1] + src1[2*x+2] + src2[2*x+1] + src2[2*x+2] + 2) >> 2;<br>
- }<br>- dst0[x] = (src0[2*x ] + src0[2*x+1] + src1[2*x ] + src1[2*x+1] + 2) >> 2;<br>- dstv[x] = (src1[2*x ] + src1[2*x+1] + src2[2*x ] + src2[2*x+1] + 2) >> 2;<br>- dsth[x] = (src0[2*x+1] + src1[2*x+1] + 1) >> 1;<br>
- dstc[x] = (src1[2*x+1] + src2[2*x+1] + 1) >> 1;<br>- }<br>- for( i = 0; i < 4; i++ )<br>- memcpy( &frame->lowres[i][y*i_stride2], &frame->lowres[i][(y-1)*i_stride2], i_width2 );<br>
-<br>- for( y = 0; y < 16; y++ )<br>- for( x = 0; x < 16; x++ )<br>- frame->i_cost_est[x][y] = -1;<br>-<br>- x264_frame_expand_border_lowres( frame );<br>-}<br>-<br>Index: common/mc.h<br>===================================================================<br>
--- common/mc.h (revision 745)<br>+++ common/mc.h (working copy)<br>@@ -65,7 +65,9 @@<br> uint8_t *pix_uv, int stride_uv, int mb_x );<br> /* prefetch the next few macroblocks of a hpel reference frame */<br>
void (*prefetch_ref)( uint8_t *pix, int stride, int parity );<br>-<br>+ <br>+ /* Lowres frame context init */<br>+ void (*frame_init_lowres_core)( int src_stride, int dest_stride, int height, int width, uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc );<br>
} x264_mc_functions_t;<br> <br> void x264_mc_init( int cpu, x264_mc_functions_t *pf );<br>Index: common/i386/mc-a.asm<br>===================================================================<br>--- common/i386/mc-a.asm (revision 745)<br>
+++ common/i386/mc-a.asm (working copy)<br>@@ -128,21 +128,23 @@<br> movdqa [eax+ebx], xmm1<br> AVG_END<br> <br>-%macro AVGH 2<br>-cglobal x264_pixel_avg_%1x%2_mmxext<br>+%macro AVGH 3<br>+cglobal x264_pixel_avg_%1x%2_%3<br>
push esi<br> mov esi, %2<br>- jmp x264_pixel_avg_w%1_mmxext<br>+ jmp x264_pixel_avg_w%1_%3<br> %endmacro<br> <br>-AVGH 16, 16<br>-AVGH 16, 8<br>-AVGH 8, 16<br>-AVGH 8, 8<br>-AVGH 8, 4<br>-AVGH 4, 8<br>-AVGH 4, 4<br>
-AVGH 4, 2<br>+AVGH 16, 16, mmxext<br>+AVGH 16, 8, mmxext<br>+AVGH 8, 16, mmxext<br>+AVGH 8, 8, mmxext<br>+AVGH 8, 4, mmxext<br>+AVGH 4, 8, mmxext<br>+AVGH 4, 4, mmxext<br>+AVGH 4, 2, mmxext<br>+AVGH 16, 16, sse2<br>+AVGH 16, 8, sse2<br>
<br> %macro AVG2_START 1<br> cglobal %1<br>@@ -191,6 +193,21 @@<br> movq [eax+ebx], mm1<br> AVG2_END<br> <br>+AVG2_START x264_pixel_avg2_w12_mmxext<br>+ movq mm0, [ecx]<br>+ movd mm1, [ecx+8]<br>+ movq mm2, [ecx+edx]<br>
+ movd mm3, [ecx+edx+8]<br>+ pavgb mm0, [ecx+edi]<br>+ pavgb mm1, [ecx+edi+8]<br>+ pavgb mm2, [ecx+ebp]<br>+ pavgb mm3, [ecx+ebp+8]<br>+ movq [eax], mm0<br>+ movd [eax+8], mm1<br>+ movq [eax+ebx], mm2<br>
+ movd [eax+ebx+8], mm3<br>+AVG2_END<br>+<br> AVG2_START x264_pixel_avg2_w16_mmxext<br> movq mm0, [ecx]<br> movq mm1, [ecx+8]<br>@@ -227,7 +244,33 @@<br> movd [eax+ebx+16], mm5<br> AVG2_END<br> <br>
+AVG2_START x264_pixel_avg2_w16_sse2<br>+ movdqu xmm0, [ecx]<br>+ movdqu xmm2, [ecx+edi]<br>+ movdqu xmm1, [ecx+edx]<br>+ movdqu xmm3, [ecx+ebp]<br>+ pavgb xmm0, xmm2<br>+ pavgb xmm1, xmm3<br>+ movdqa [eax], xmm0<br>
+ movdqa [eax+ebx], xmm1<br>+AVG2_END<br> <br>+AVG2_START x264_pixel_avg2_w20_sse2<br>+ movdqu xmm0, [ecx]<br>+ movdqu xmm2, [ecx+edi]<br>+ movdqu xmm1, [ecx+edx]<br>+ movdqu xmm3, [ecx+ebp]<br>+ movd mm2, [ecx+16]<br>
+ movd mm5, [ecx+edx+16]<br>+ pavgb xmm0, xmm2<br>+ pavgb xmm1, xmm3<br>+ pavgb mm2, [ecx+edi+16]<br>+ pavgb mm5, [ecx+ebp+16]<br>+ movdqa [eax], xmm0<br>+ movd [eax+16], mm2<br>+ movdqa [eax+ebx], xmm1<br>
+ movd [eax+ebx+16], mm5<br>+AVG2_END<br> <br> ;=============================================================================<br> ; weighted prediction<br>Index: common/i386/mc-c.c<br>===================================================================<br>
--- common/i386/mc-c.c (revision 745)<br>+++ common/i386/mc-c.c (working copy)<br>@@ -28,6 +28,8 @@<br> #include "common/common.h"<br> <br> /* NASM functions */<br>+extern void x264_pixel_avg_16x16_sse2( uint8_t *, int, uint8_t *, int );<br>
+extern void x264_pixel_avg_16x8_sse2( uint8_t *, int, uint8_t *, int );<br> extern void x264_pixel_avg_16x16_mmxext( uint8_t *, int, uint8_t *, int );<br> extern void x264_pixel_avg_16x8_mmxext( uint8_t *, int, uint8_t *, int );<br>
extern void x264_pixel_avg_8x16_mmxext( uint8_t *, int, uint8_t *, int );<br>@@ -38,8 +40,11 @@<br> extern void x264_pixel_avg_4x2_mmxext( uint8_t *, int, uint8_t *, int );<br> extern void x264_pixel_avg2_w4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );<br>
extern void x264_pixel_avg2_w8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );<br>+extern void x264_pixel_avg2_w12_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );<br> extern void x264_pixel_avg2_w16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );<br>
extern void x264_pixel_avg2_w20_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );<br>+extern void x264_pixel_avg2_w16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int );<br>+extern void x264_pixel_avg2_w20_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int );<br>
extern void x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int );<br> extern void x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int );<br> extern void x264_pixel_avg_weight_w16_mmxext( uint8_t *, int, uint8_t *, int, int, int );<br>
@@ -52,7 +57,16 @@<br> extern void x264_prefetch_ref_mmxext( uint8_t *, int, int );<br> extern void x264_hpel_filter_mmxext( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,<br> int i_stride, int i_width, int i_height );<br>
+extern void frame_init_lowres_core_ssse3_w64( int src_stride, int dest_stride, int height, int width, uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc );<br>+extern void frame_init_lowres_core_ssse3_w32( int src_stride, int dest_stride, int height, int width, uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc );<br>
+extern void frame_init_lowres_core_ssse3_w16( int src_stride, int dest_stride, int height, int width, uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc );<br>+extern void frame_init_lowres_core_sse2_w64( int src_stride, int dest_stride, int height, int width, uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc );<br>
+extern void frame_init_lowres_core_sse2_w32( int src_stride, int dest_stride, int height, int width, uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc );<br>+extern void frame_init_lowres_core_sse2_w16( int src_stride, int dest_stride, int height, int width, uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc );<br>
+extern void frame_init_lowres_core_mmx_w32( int src_stride, int dest_stride, int height, int width, uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc );<br>+extern void frame_init_lowres_core_mmx_w16( int src_stride, int dest_stride, int height, int width, uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc );<br>
<br>+<br> #define AVG_WEIGHT(W,H) \<br> void x264_pixel_avg_weight_ ## W ## x ## H ## _mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_weight_dst ) \<br> { \<br>@@ -69,9 +83,9 @@<br> NULL,<br> x264_pixel_avg2_w4_mmxext,<br>
x264_pixel_avg2_w8_mmxext,<br>+ x264_pixel_avg2_w12_mmxext,<br> x264_pixel_avg2_w16_mmxext,<br>- x264_pixel_avg2_w16_mmxext,<br>- x264_pixel_avg2_w20_mmxext,<br>+ x264_pixel_avg2_w20_mmxext<br> };<br>
static void (* const x264_mc_copy_wtab_mmx[5])( uint8_t *, int, uint8_t *, int, int ) =<br> {<br>@@ -81,57 +95,122 @@<br> NULL,<br> x264_mc_copy_w16_mmx<br> };<br>+static void (* const x264_pixel_avg_wtab_sse2[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =<br>
+{<br>+ NULL,<br>+ x264_pixel_avg2_w4_mmxext,<br>+ x264_pixel_avg2_w8_mmxext,<br>+ x264_pixel_avg2_w12_mmxext,<br>+ x264_pixel_avg2_w16_sse2,<br>+ x264_pixel_avg2_w20_sse2<br>+};<br>+static void (* const x264_mc_copy_wtab_sse2[5])( uint8_t *, int, uint8_t *, int, int ) =<br>
+{<br>+ NULL,<br>+ x264_mc_copy_w4_mmx,<br>+ x264_mc_copy_w8_mmx,<br>+ NULL,<br>+ x264_mc_copy_w16_sse2<br>+};<br> static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};<br> static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};<br>
<br>-void mc_luma_mmxext( uint8_t *dst, int i_dst_stride,<br>- uint8_t *src[4], int i_src_stride,<br>- int mvx, int mvy,<br>- int i_width, int i_height )<br>
-{<br>- int qpel_idx = ((mvy&3)<<2) + (mvx&3);<br>- int offset = (mvy>>2)*i_src_stride + (mvx>>2);<br>- uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;<br>
+#define MC_LUMA(name,instr1,instr2)\<br>+void mc_luma_##name( uint8_t *dst, int i_dst_stride,\<br>+ uint8_t *src[4], int i_src_stride,\<br>+ int mvx, int mvy,\<br>+ int i_width, int i_height )\<br>
+{\<br>+ int qpel_idx = ((mvy&3)<<2) + (mvx&3);\<br>+ int offset = (mvy>>2)*i_src_stride + (mvx>>2);\<br>+ uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\<br>
+ if( qpel_idx & 5 ) /* qpel interpolation needed */\<br>+ {\<br>+ uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\<br>+ x264_pixel_avg_wtab_##instr1[i_width>>2](\<br>
+ dst, i_dst_stride, src1, i_src_stride,\<br>+ src2, i_height );\<br>+ }\<br>+ else\<br>+ {\<br>+ x264_mc_copy_wtab_##instr2[i_width>>2](\<br>+ dst, i_dst_stride, src1, i_src_stride, i_height );\<br>
+ }\<br>+}<br> <br>- if( qpel_idx & 5 ) /* qpel interpolation needed */<br>- {<br>- uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);<br>- x264_pixel_avg_wtab_mmxext[i_width>>2](<br>
- dst, i_dst_stride, src1, i_src_stride,<br>- src2, i_height );<br>- }<br>- else<br>- {<br>- x264_mc_copy_wtab_mmx[i_width>>2](<br>- dst, i_dst_stride, src1, i_src_stride, i_height );<br>
- }<br>+MC_LUMA(mmxext,mmxext,mmx)<br>+MC_LUMA(sse2,sse2,sse2)<br>+#include "bench.h"<br>+#define GET_REF(name)\<br>+uint8_t *get_ref_##name( uint8_t *dst, int *i_dst_stride,\<br>+ uint8_t *src[4], int i_src_stride,\<br>
+ int mvx, int mvy,\<br>+ int i_width, int i_height )\<br>+{\<br>+ int qpel_idx = ((mvy&3)<<2) + (mvx&3);\<br>+ int offset = (mvy>>2)*i_src_stride + (mvx>>2);\<br>
+ uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\<br>+ if( qpel_idx & 5 ) /* qpel interpolation needed */\<br>+ {\<br>+ uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\<br>
+ x264_pixel_avg_wtab_##name[i_width>>2](\<br>+ dst, *i_dst_stride, src1, i_src_stride,\<br>+ src2, i_height );\<br>+ return dst;\<br>+ }\<br>+ else\<br>+ {\<br>+ *i_dst_stride = i_src_stride;\<br>
+ return src1;\<br>+ }\<br> }<br> <br>-uint8_t *get_ref_mmxext( uint8_t *dst, int *i_dst_stride,<br>- uint8_t *src[4], int i_src_stride,<br>- int mvx, int mvy,<br>
- int i_width, int i_height )<br>+GET_REF(mmxext)<br>+GET_REF(sse2)<br>+<br>+void frame_init_lowres_core_mmx(int src_stride, int dest_stride, int height, int width, uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc)<br>
{<br>- int qpel_idx = ((mvy&3)<<2) + (mvx&3);<br>- int offset = (mvy>>2)*i_src_stride + (mvx>>2);<br>- uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;<br>
-<br>- if( qpel_idx & 5 ) /* qpel interpolation needed */<br>+ int x;<br>+ width = width >> 3;<br>+ for( x = 0; width - x >= 2; x++ )<br> {<br>- uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);<br>
- x264_pixel_avg_wtab_mmxext[i_width>>2](<br>- dst, *i_dst_stride, src1, i_src_stride,<br>- src2, i_height );<br>- return dst;<br>+ frame_init_lowres_core_mmx_w32(src_stride, dest_stride, height, width, src0, dst0, dsth, dstv, dstc );<br>
+ src0 += 32;<br>+ dst0 += 16;<br>+ dsth += 16;<br>+ dstv += 16;<br>+ dstc += 16;<br>+ x++;<br> }<br>- else<br>+ if(width - x == 1)<br> {<br>- *i_dst_stride = i_src_stride;<br>
- return src1;<br>+ frame_init_lowres_core_mmx_w16(src_stride, dest_stride, height, width, src0, dst0, dsth, dstv, dstc );<br> }<br> }<br> <br>+#define FRAME_INIT_SSE(name)\<br>+void frame_init_lowres_core_##name(int src_stride, int dest_stride, int height, int width, uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc)\<br>
+{\<br>+ int x; width = width >> 3;\<br>+ for( x = 0; width - x >= 4; x++ )\<br>+ {\<br>+ frame_init_lowres_core_##name##_w64(src_stride, dest_stride, height, width, src0, dst0, dsth, dstv, dstc );\<br>
+ src0 += 64; dst0 += 32; dsth += 32; dstv += 32; dstc += 32; x+=3;\<br>+ }\<br>+ for( ; width - x >= 2; x++ )\<br>+ {\<br>+ frame_init_lowres_core_##name##_w32(src_stride, dest_stride, height, width, src0, dst0, dsth, dstv, dstc );\<br>
+ src0 += 32; dst0 += 16; dsth += 16; dstv += 16; dstc += 16; x++;\<br>+ }\<br>+ if(width - x == 1)\<br>+ {\<br>+ frame_init_lowres_core_##name##_w16(src_stride, dest_stride, height, width, src0, dst0, dsth, dstv, dstc );\<br>
+ }\<br>+}<br> <br>+FRAME_INIT_SSE(sse2)<br>+FRAME_INIT_SSE(ssse3)<br>+<br> void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )<br> {<br> if( !(cpu&X264_CPU_MMX) )<br>@@ -169,6 +248,19 @@<br> <br> pf->prefetch_fenc = x264_prefetch_fenc_mmxext;<br>
pf->prefetch_ref = x264_prefetch_ref_mmxext;<br>-<br>- /* todo: use sse2 */<br>+ // disable on AMD processors since it is slower<br>+ if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_3DNOW) )<br>
+ {<br>+ pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;<br>+ pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2;<br>+ pf->mc_luma = mc_luma_sse2;<br>+ pf->get_ref = get_ref_sse2;<br>
+ pf->frame_init_lowres_core = frame_init_lowres_core_sse2;<br>+ }<br>+ #ifdef HAVE_SSE3<br>+ if( cpu&X264_CPU_SSSE3 )<br>+ {<br>+ pf->frame_init_lowres_core = frame_init_lowres_core_ssse3;<br>
+ }<br>+#endif //HAVE_SSE3<br> }<br>Index: common/i386/dct-a.asm<br>===================================================================<br>--- common/i386/dct-a.asm (revision 745)<br>+++ common/i386/dct-a.asm (working copy)<br>
@@ -711,7 +711,7 @@<br> MMX_STORE_DIFF_8P xmm0, [ecx+FDEC_STRIDE*6], xmm6, xmm7<br> MMX_STORE_DIFF_8P xmm1, [ecx+FDEC_STRIDE*7], xmm6, xmm7<br> ret<br>-<br>+ <br> ;-----------------------------------------------------------------------------<br>
; void __cdecl x264_sub8x8_dct_mmx( int16_t dct[4][4][4],<br> ; uint8_t *pix1, uint8_t *pix2 )<br>Index: common/i386/mc-a2.asm<br>===================================================================<br>
--- common/i386/mc-a2.asm (revision 745)<br>+++ common/i386/mc-a2.asm (working copy)<br>@@ -33,6 +33,7 @@<br> SECTION_RODATA<br> <br> ALIGN 16<br>+pw_255: times 8 dw 255<br> pw_1: times 4 dw 1<br> pw_16: times 4 dw 16<br>
pw_32: times 4 dw 32<br>@@ -324,4 +325,266 @@<br> pop edi<br> emms<br> ret<br>+ <br>+%macro FILTER_START 2<br>+ picgetgot eax<br>+ mov%2 %1, [pw_255 GLOBAL]<br>+ push ebx<br>+ push edi<br>+ push esi<br>
+ push ebp<br>+ mov ebp, [esp+4+16 ] ;src_stride <br>+ mov eax, [esp+20+16] ;source<br>+ mov ebx, [esp+24+16] ;dest0<br>+ mov ecx, [esp+28+16] ;desth<br>+ mov edx, [esp+32+16] ;destv<br>
+ mov edi, [esp+36+16] ;destc<br>+ mov esi, [esp+8+16 ] ;dest_stride<br>+%endmacro<br> <br>+%macro FILTER_END 0<br>+ pop ebp<br>+ pop esi<br>+ pop edi<br>+ pop ebx<br>+ ret<br>+%endmacro<br>
+<br>+%macro FILTER_PREFETCH 1<br>+ prefetch [eax+ebp*%1]<br>+ prefetch [ebx+esi*%1]<br>+ prefetch [ecx+esi*%1]<br>+ prefetch [edx+esi*%1]<br>+ prefetch [edi+esi*%1]<br>+%endmacro<br>+<br>+%macro INIT_LOAD 3<br>
+ mov%3 %1, [eax+1+%2]<br>+ pavgb %1, [eax+%2]<br>+%endmacro<br>+<br>+%macro WIDTH_FILTER 11<br>+ mov%8 %1, [eax+1+ebp+%6]<br>+ pavgb %1, [eax+ebp+%6]<br>+ mov%9 %2, %1<br>+ pavgb %1, %3<br>+ mov%9 %3, %2<br>
+ pand %1, %11<br>+ psrlw %2, 8<br>+ packuswb %1, %1<br>+ packuswb %2, %2<br>+ mov%10 [%4+%7], %1<br>+ mov%10 [%5+%7], %2<br>+%endmacro<br>+<br>+%macro WIDTH16_FILTER_SSSE3 7<br>+ movdqa %1, [eax+ebp+%6]<br>
+ movdqa %2, [eax+16+ebp+%6]<br>+ palignr %2, %1, 1<br>+ pavgb %1, %2<br>+ movdqa %2, %1<br>+ pavgb %1, %3<br>+ movdqa %3, %2<br>+ pand %1, xmm7<br>+ psrlw %2, 8<br>+ packuswb %1, %1<br>+ packuswb %2, %2<br>
+ movq [%4+%7], %1<br>+ movq [%5+%7], %2<br>+%endmacro<br>+<br>+%macro WIDTH8_FILTER_MMX 7<br>+ WIDTH_FILTER %1, %2, %3, %4, %5, %6, %7, q, q, d, mm7<br>+%endmacro<br>+<br>+%macro WIDTH16_FILTER_SSE2 7<br>+ WIDTH_FILTER %1, %2, %3, %4, %5, %6, %7, dqu, dqa, q, xmm7<br>
+%endmacro<br>+<br>+%macro LOWRES_FILTER_STEP_W64_SSE 3<br>+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm2, %1, %2, 0, 0<br>+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm3, %1, %2, 16, 8<br>+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm4, %1, %2, 32, 16<br>
+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm5, %1, %2, 48, 24<br>+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm2, %1, %2, 0, 0<br>+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm3, %1, %2, 16, 8<br>+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm4, %1, %2, 32, 16<br>
+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm5, %1, %2, 48, 24<br>+ add eax, ebp<br>+ add %1, esi<br>+ add %2, esi<br>+%endmacro<br>+<br>+%macro LOWRES_FILTER_STEP_W32_SSE 3<br>+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm2, %1, %2, 0, 0<br>
+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm3, %1, %2, 16, 8<br>+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm2, %1, %2, 0, 0<br>+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm3, %1, %2, 16, 8<br>+ add eax, ebp<br>+ add %1, esi<br>+ add %2, esi<br>
+%endmacro<br>+<br>+%macro LOWRES_FILTER_STEP_W16_SSE 3<br>+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm2, %1, %2, 0, 0<br>+ WIDTH16_FILTER_%3 xmm0, xmm1, xmm2, %1, %2, 0, 0<br>+ add eax, ebp<br>+ add %1, esi<br>+ add %2, esi<br>
+%endmacro<br>+<br>+;-----------------------------------------------------------------------------<br>+; void frame_init_lowres_core_sse2_w64( int src_stride, int dest_stride, int height, int width, <br>+; uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc )<br>
+;-----------------------------------------------------------------------------<br>+cglobal frame_init_lowres_core_sse2_w64<br>+ FILTER_START xmm7, dqa<br>+ INIT_LOAD xmm2, 0, dqu<br>+ INIT_LOAD xmm3, 16, dqu<br>
+ INIT_LOAD xmm4, 32, dqu<br>+ INIT_LOAD xmm5, 48, dqu<br>+ .vloop:<br>+ FILTER_PREFETCH 2<br>+ LOWRES_FILTER_STEP_W64_SSE ebx, ecx, SSE2<br>+ LOWRES_FILTER_STEP_W64_SSE edx, edi, SSE2<br>+ dec dword [esp+12+16]<br>
+ jg .vloop<br>+ FILTER_END<br>+<br>+;-----------------------------------------------------------------------------<br>+; void frame_init_lowres_core_sse2_w32( int src_stride, int dest_stride, int height, int width, <br>
+; uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc )<br>+;-----------------------------------------------------------------------------<br>+cglobal frame_init_lowres_core_sse2_w32<br>
+ FILTER_START xmm7, dqa<br>+ INIT_LOAD xmm2, 0, dqu<br>+ INIT_LOAD xmm3, 16, dqu<br>+ .vloop:<br>+ FILTER_PREFETCH 2<br>+ LOWRES_FILTER_STEP_W32_SSE ebx, ecx, SSE2<br>+ LOWRES_FILTER_STEP_W32_SSE edx, edi, SSE2<br>
+ dec dword [esp+12+16]<br>+ jg .vloop<br>+ FILTER_END<br>+<br>+;-----------------------------------------------------------------------------<br>+; void frame_init_lowres_core_sse2_w16( int src_stride, int dest_stride, int height, int width, <br>
+; uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc )<br>+;-----------------------------------------------------------------------------<br>+cglobal frame_init_lowres_core_sse2_w16<br>
+ FILTER_START xmm7, dqa<br>+ INIT_LOAD xmm2, 0, dqu<br>+ .vloop:<br>+ FILTER_PREFETCH 4<br>+ LOWRES_FILTER_STEP_W16_SSE ebx, ecx, SSE2<br>+ LOWRES_FILTER_STEP_W16_SSE edx, edi, SSE2<br>+ dec dword [esp+12+16]<br>
+ jg .vloop<br>+ FILTER_END<br>+<br>+%macro LOWRES_FILTER_STEP_W32_MMX 2<br>+ WIDTH8_FILTER_MMX mm0, mm1, mm2, %1, %2, 0, 0<br>+ WIDTH8_FILTER_MMX mm0, mm1, mm3, %1, %2, 8, 4<br>+ WIDTH8_FILTER_MMX mm0, mm1, mm4, %1, %2, 16, 8<br>
+ WIDTH8_FILTER_MMX mm0, mm1, mm5, %1, %2, 24, 12<br>+ WIDTH8_FILTER_MMX mm0, mm1, mm2, %1, %2, 0, 0<br>+ WIDTH8_FILTER_MMX mm0, mm1, mm3, %1, %2, 8, 4<br>+ WIDTH8_FILTER_MMX mm0, mm1, mm4, %1, %2, 16, 8<br>+ WIDTH8_FILTER_MMX mm0, mm1, mm5, %1, %2, 24, 12<br>
+ add eax, ebp<br>+ add %1, esi<br>+ add %2, esi<br>+%endmacro<br>+<br>+%macro LOWRES_FILTER_STEP_W16_MMX 2<br>+ WIDTH8_FILTER_MMX mm0, mm1, mm2, %1, %2, 0, 0<br>+ WIDTH8_FILTER_MMX mm0, mm1, mm3, %1, %2, 8, 4<br>
+ WIDTH8_FILTER_MMX mm0, mm1, mm2, %1, %2, 0, 0<br>+ WIDTH8_FILTER_MMX mm0, mm1, mm3, %1, %2, 8, 4<br>+ add eax, ebp<br>+ add %1, esi<br>+ add %2, esi<br>+%endmacro<br>+<br>+;-----------------------------------------------------------------------------<br>
+; void frame_init_lowres_core_mmx_w32( int src_stride, int dest_stride, int height, int width, <br>+; uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc )<br>
+;-----------------------------------------------------------------------------<br>+cglobal frame_init_lowres_core_mmx_w32<br>+ FILTER_START mm7, q<br>+ INIT_LOAD mm2, 0, q<br>+ INIT_LOAD mm3, 8, q<br>+ INIT_LOAD mm4, 16, q<br>
+ INIT_LOAD mm5, 24, q<br>+ .vloop:<br>+ FILTER_PREFETCH 2<br>+ LOWRES_FILTER_STEP_W32_MMX ebx, ecx<br>+ LOWRES_FILTER_STEP_W32_MMX edx, edi<br>+ dec dword [esp+12+16]<br>+ jg .vloop<br>
+ FILTER_END<br>+<br>+;-----------------------------------------------------------------------------<br>+; void frame_init_lowres_core_mmx_w16( int src_stride, int dest_stride, int height, int width, <br>+; uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc )<br>
+;-----------------------------------------------------------------------------<br>+cglobal frame_init_lowres_core_mmx_w16<br>+ FILTER_START mm7, q<br>+ INIT_LOAD mm2, 0, q<br>+ INIT_LOAD mm3, 8, q<br>+ .vloop:<br>
+ FILTER_PREFETCH 2<br>+ LOWRES_FILTER_STEP_W16_MMX ebx, ecx<br>+ LOWRES_FILTER_STEP_W16_MMX edx, edi<br>+ dec dword [esp+12+16]<br>+ jg .vloop<br>+ FILTER_END<br>+ <br>+;-----------------------------------------------------------------------------<br>
+; void frame_init_lowres_core_ssse3_w64( int src_stride, int dest_stride, int height, int width, <br>+; uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc )<br>
+;-----------------------------------------------------------------------------<br>+cglobal frame_init_lowres_core_ssse3_w64<br>+ FILTER_START xmm7, dqa<br>+ INIT_LOAD xmm2, 0, dqu<br>+ INIT_LOAD xmm3, 16, dqu<br>
+ INIT_LOAD xmm4, 32, dqu<br>+ INIT_LOAD xmm5, 48, dqu<br>+ .vloop:<br>+ FILTER_PREFETCH 2<br>+ LOWRES_FILTER_STEP_W64_SSE ebx, ecx, SSSE3<br>+ LOWRES_FILTER_STEP_W64_SSE edx, edi, SSSE3<br>+ dec dword [esp+12+16]<br>
+ jg .vloop<br>+ FILTER_END<br>+<br>+;-----------------------------------------------------------------------------<br>+; void frame_init_lowres_core_ssse3_w32( int src_stride, int dest_stride, int height, int width, <br>
+; uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc )<br>+;-----------------------------------------------------------------------------<br>+cglobal frame_init_lowres_core_ssse3_w32<br>
+ FILTER_START xmm7, dqa<br>+ INIT_LOAD xmm2, 0, dqu<br>+ INIT_LOAD xmm3, 16, dqu<br>+ .vloop:<br>+ FILTER_PREFETCH 2<br>+ LOWRES_FILTER_STEP_W32_SSE ebx, ecx, SSSE3<br>+ LOWRES_FILTER_STEP_W32_SSE edx, edi, SSSE3<br>
+ dec dword [esp+12+16]<br>+ jg .vloop<br>+ FILTER_END<br>+<br>+;-----------------------------------------------------------------------------<br>+; void frame_init_lowres_core_ssse3_w16( int src_stride, int dest_stride, int height, int width, <br>
+; uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc )<br>+;-----------------------------------------------------------------------------<br>+cglobal frame_init_lowres_core_ssse3_w16<br>
+ FILTER_START xmm7, dqa<br>+ INIT_LOAD xmm2, 0, dqu<br>+ .vloop:<br>+ FILTER_PREFETCH 4<br>+ LOWRES_FILTER_STEP_W16_SSE ebx, ecx, SSSE3<br>+ LOWRES_FILTER_STEP_W16_SSE edx, edi, SSSE3<br>+ dec dword [esp+12+16]<br>
+ jg .vloop<br>+ FILTER_END<br>\ No newline at end of file<br>Index: tools/checkasm.c<br>===================================================================<br>--- tools/checkasm.c (revision 745)<br>+++ tools/checkasm.c (working copy)<br>
@@ -407,8 +407,8 @@<br> uint8_t *src = &buf1[2*32+2];<br> uint8_t *src2[4] = { &buf1[2*32+2], &buf1[6*32+2],<br> &buf1[10*32+2], &buf1[14*32+2] };<br>- uint8_t *dst1 = &buf3[2*32+2];<br>
- uint8_t *dst2 = &buf4[2*32+2];<br>+ uint8_t *dst1 = &buf3[2*32];<br>+ uint8_t *dst2 = &buf4[2*32];<br> <br> int dx, dy, i, j, w;<br> int ret = 0, ok, used_asm;<br>@@ -519,7 +519,43 @@<br>
for( w = -64; w <= 128 && ok; w++ )<br> MC_TEST_AVG( avg_weight, w );<br> report( "mc wpredb :" );<br>-<br>+ <br>+ DECLARE_ALIGNED( uint8_t, src1[64*64], 16 );<br>+ DECLARE_ALIGNED( uint8_t, dst1a[64*64], 16 );<br>
+ DECLARE_ALIGNED( uint8_t, dst2a[64*64], 16 );<br>+ DECLARE_ALIGNED( uint8_t, dst3a[64*64], 16 );<br>+ DECLARE_ALIGNED( uint8_t, dst4a[64*64], 16 );<br>+ DECLARE_ALIGNED( uint8_t, dst1b[64*64], 16 );<br>+ DECLARE_ALIGNED( uint8_t, dst2b[64*64], 16 );<br>
+ DECLARE_ALIGNED( uint8_t, dst3b[64*64], 16 );<br>+ DECLARE_ALIGNED( uint8_t, dst4b[64*64], 16 );<br>+ #define MC_TEST_LOWRES(w,h) \<br>+ if( mc_a.frame_init_lowres_core != mc_ref.frame_init_lowres_core ) \<br>
+ { \<br>+ used_asm = 1; \<br>+ /* src1 already holds random data from the caller loop */ \<br>+ mc_c.frame_init_lowres_core( w, w/2, h/2, w/2, src1, dst1a, dst2a, dst3a, dst4a);\<br>+ mc_a.frame_init_lowres_core( w, w/2, h/2, w/2, src1, dst1b, dst2b, dst3b, dst4b);\<br>
+ if( memcmp( dst1a, dst1b, w*h/4 ) || memcmp( dst2a, dst2b, w*h/4 )\<br>+ || memcmp( dst3a, dst3b, w*h/4 ) || memcmp( dst4a, dst4b, w*h/4 )) \<br>+ { \<br>+ ok = 0; \<br>
+ } \<br>+ }<br>+ for( j = 0; j < 1000; j++)<br>+ {<br>+ for( i = 0; i < 64*64; i++ )<br>+ {<br>+ src1[i] = rand() & 0xFF;<br>+ }<br>+ MC_TEST_LOWRES(16,16);<br>
+ MC_TEST_LOWRES(32,48);<br>+ MC_TEST_LOWRES(48,16);<br>+ MC_TEST_LOWRES(16,32);<br>+ MC_TEST_LOWRES(32,16);<br>+ MC_TEST_LOWRES(32,32);<br>+ }<br>+ report( "frame_init_lowres:" );<br>
return ret;<br> }<br> <br><br>