Index: common/mc.c
===================================================================
--- common/mc.c (revision 745)
+++ common/mc.c (working copy)
@@ -377,7 +377,9 @@
 #ifdef HAVE_MMX
     x264_mc_init_mmx( cpu, pf );
     if( cpu&X264_CPU_MMXEXT )
+    {
         pf->mc_chroma = x264_mc_chroma_mmxext;
+    }
 #endif
 #ifdef ARCH_PPC
     if( cpu&X264_CPU_ALTIVEC )
Index: common/i386/mc-a.asm
===================================================================
--- common/i386/mc-a.asm (revision 745)
+++ common/i386/mc-a.asm (working copy)
@@ -128,21 +128,23 @@
     movdqa  [eax+ebx], xmm1
 AVG_END
 
-%macro AVGH 2
-cglobal x264_pixel_avg_%1x%2_mmxext
+%macro AVGH 3
+cglobal x264_pixel_avg_%1x%2_%3
     push esi
     mov esi, %2
-    jmp x264_pixel_avg_w%1_mmxext
+    jmp x264_pixel_avg_w%1_%3
 %endmacro
 
-AVGH 16, 16
-AVGH 16, 8
-AVGH 8, 16
-AVGH 8, 8
-AVGH 8, 4
-AVGH 4, 8
-AVGH 4, 4
-AVGH 4, 2
+AVGH 16, 16, mmxext
+AVGH 16, 8, mmxext
+AVGH 8, 16, mmxext
+AVGH 8, 8, mmxext
+AVGH 8, 4, mmxext
+AVGH 4, 8, mmxext
+AVGH 4, 4, mmxext
+AVGH 4, 2, mmxext
+AVGH 16, 16, sse2
+AVGH 16, 8, sse2
 
 %macro AVG2_START 1
 cglobal %1
@@ -191,6 +193,21 @@
     movq    [eax+ebx], mm1
 AVG2_END
 
+AVG2_START x264_pixel_avg2_w12_mmxext
+    movq    mm0, [ecx]
+    movd    mm1, [ecx+8]
+    movq    mm2, [ecx+edx]
+    movd    mm3, [ecx+edx+8]
+    pavgb   mm0, [ecx+edi]
+    pavgb   mm1, [ecx+edi+8]
+    pavgb   mm2, [ecx+ebp]
+    pavgb   mm3, [ecx+ebp+8]
+    movq    [eax], mm0
+    movd    [eax+8], mm1
+    movq    [eax+ebx], mm2
+    movd    [eax+ebx+8], mm3
+AVG2_END
+
 AVG2_START x264_pixel_avg2_w16_mmxext
     movq    mm0, [ecx]
     movq    mm1, [ecx+8]
@@ -227,7 +244,33 @@
     movd    [eax+ebx+16], mm5
 AVG2_END
 
+AVG2_START x264_pixel_avg2_w16_sse2
+    movdqu  xmm0, [ecx]
+    movdqu  xmm2, [ecx+edi]
+    movdqu  xmm1, [ecx+edx]
+    movdqu  xmm3, [ecx+ebp]
+    pavgb   xmm0, xmm2
+    pavgb   xmm1, xmm3
+    movdqa  [eax], xmm0
+    movdqa  [eax+ebx], xmm1
+AVG2_END
 
+AVG2_START x264_pixel_avg2_w20_sse2
+    movdqu  xmm0, [ecx]
+    movdqu  xmm2, [ecx+edi]
+    movdqu  xmm1, [ecx+edx]
+    movdqu  xmm3, [ecx+ebp]
+    movd    mm2, [ecx+16]
+    movd    mm5, [ecx+edx+16]
+    pavgb   xmm0, xmm2
+    pavgb   xmm1, xmm3
+    pavgb   mm2, [ecx+edi+16]
+    pavgb   mm5, [ecx+ebp+16]
+    movdqa  [eax], xmm0
+    movd    [eax+16], mm2
+    movdqa  [eax+ebx], xmm1
+    movd    [eax+ebx+16], mm5
+AVG2_END
 
 ;=============================================================================
 ; weighted prediction
Index: common/i386/mc-c.c
===================================================================
--- common/i386/mc-c.c (revision 745)
+++ common/i386/mc-c.c (working copy)
@@ -28,6 +28,8 @@
#include "common/common.h"<br> <br> /* NASM functions */<br>+extern void x264_pixel_avg_16x16_sse2( uint8_t *, int, uint8_t *, int );<br>+extern void x264_pixel_avg_16x8_sse2( uint8_t *, int, uint8_t *, int );<br>
extern void x264_pixel_avg_16x16_mmxext( uint8_t *, int, uint8_t *, int );<br> extern void x264_pixel_avg_16x8_mmxext( uint8_t *, int, uint8_t *, int );<br> extern void x264_pixel_avg_8x16_mmxext( uint8_t *, int, uint8_t *, int );<br>
@@ -38,8 +40,11 @@<br> extern void x264_pixel_avg_4x2_mmxext( uint8_t *, int, uint8_t *, int );<br> extern void x264_pixel_avg2_w4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );<br> extern void x264_pixel_avg2_w8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );<br>
+extern void x264_pixel_avg2_w12_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );<br> extern void x264_pixel_avg2_w16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );<br> extern void x264_pixel_avg2_w20_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );<br>
+extern void x264_pixel_avg2_w16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int );<br>+extern void x264_pixel_avg2_w20_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int );<br> extern void x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int );<br>
extern void x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int );<br> extern void x264_pixel_avg_weight_w16_mmxext( uint8_t *, int, uint8_t *, int, int, int );<br>@@ -69,9 +74,9 @@<br> NULL,<br>
x264_pixel_avg2_w4_mmxext,<br> x264_pixel_avg2_w8_mmxext,<br>+ x264_pixel_avg2_w12_mmxext,<br> x264_pixel_avg2_w16_mmxext,<br>- x264_pixel_avg2_w16_mmxext,<br>- x264_pixel_avg2_w20_mmxext,<br>+ x264_pixel_avg2_w20_mmxext<br>
};<br> static void (* const x264_mc_copy_wtab_mmx[5])( uint8_t *, int, uint8_t *, int, int ) =<br> {<br>@@ -81,56 +86,80 @@<br> NULL,<br> x264_mc_copy_w16_mmx<br> };<br>+static void (* const x264_pixel_avg_wtab_sse2[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =<br>
+{
+    NULL,
+    x264_pixel_avg2_w4_mmxext,
+    x264_pixel_avg2_w8_mmxext,
+    x264_pixel_avg2_w12_mmxext,
+    x264_pixel_avg2_w16_sse2,
+    x264_pixel_avg2_w20_sse2
+};
+static void (* const x264_mc_copy_wtab_sse2[5])( uint8_t *, int, uint8_t *, int, int ) =
+{
+    NULL,
+    x264_mc_copy_w4_mmx,
+    x264_mc_copy_w8_mmx,
+    NULL,
+    x264_mc_copy_w16_sse2
+};
 static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
 static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
 
-void mc_luma_mmxext( uint8_t *dst, int i_dst_stride,
-                     uint8_t *src[4], int i_src_stride,
-                     int mvx, int mvy,
-                     int i_width, int i_height )
-{
-    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
-    int offset = (mvy>>2)*i_src_stride + (mvx>>2);
-    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
-
-    if( qpel_idx & 5 ) /* qpel interpolation needed */
-    {
-        uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
-        x264_pixel_avg_wtab_mmxext[i_width>>2](
-            dst, i_dst_stride, src1, i_src_stride,
-            src2, i_height );
-    }
-    else
-    {
-        x264_mc_copy_wtab_mmx[i_width>>2](
-            dst, i_dst_stride, src1, i_src_stride, i_height );
-    }
+#define MC_LUMA(name,instr1,instr2)\
+void mc_luma_##name( uint8_t *dst, int i_dst_stride,\
+                     uint8_t *src[4], int i_src_stride,\
+                     int mvx, int mvy,\
+                     int i_width, int i_height )\
+{\
+    int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
+    int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
+    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\
+    if( qpel_idx & 5 ) /* qpel interpolation needed */\
+    {\
+        uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\
+        x264_pixel_avg_wtab_##instr1[i_width>>2](\
+            dst, i_dst_stride, src1, i_src_stride,\
+            src2, i_height );\
+    }\
+    else\
+    {\
+        x264_mc_copy_wtab_##instr2[i_width>>2](\
+            dst, i_dst_stride, src1, i_src_stride, i_height );\
+    }\
 }
 
-uint8_t *get_ref_mmxext( uint8_t *dst, int *i_dst_stride,
-                         uint8_t *src[4], int i_src_stride,
-                         int mvx, int mvy,
-                         int i_width, int i_height )
-{
-    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
-    int offset = (mvy>>2)*i_src_stride + (mvx>>2);
-    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
-
-    if( qpel_idx & 5 ) /* qpel interpolation needed */
-    {
-        uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
-        x264_pixel_avg_wtab_mmxext[i_width>>2](
-            dst, *i_dst_stride, src1, i_src_stride,
-            src2, i_height );
-        return dst;
-    }
-    else
-    {
-        *i_dst_stride = i_src_stride;
-        return src1;
-    }
+MC_LUMA(mmxext,mmxext,mmx)
+MC_LUMA(sse2,sse2,sse2)
+#define GET_REF(name)\
+uint8_t *get_ref_##name( uint8_t *dst, int *i_dst_stride,\
+                         uint8_t *src[4], int i_src_stride,\
+                         int mvx, int mvy,\
+                         int i_width, int i_height )\
+{\
+    int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
+    int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
+    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\
+    if( qpel_idx & 5 ) /* qpel interpolation needed */\
+    {\
+        uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\
+        x264_pixel_avg_wtab_##name[i_width>>2](\
+            dst, *i_dst_stride, src1, i_src_stride,\
+            src2, i_height );\
+        return dst;\
+    }\
+    else\
+    {\
+        *i_dst_stride = i_src_stride;\
+        return src1;\
+    }\
 }
 
+GET_REF(mmxext)
+GET_REF(sse2)
 
 void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
 {
@@ -169,6 +195,12 @@
 
     pf->prefetch_fenc = x264_prefetch_fenc_mmxext;
     pf->prefetch_ref = x264_prefetch_ref_mmxext;
-
-    /* todo: use sse2 */
+    // disable on AMD processors, where these SSE2 functions are slower
+    if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_3DNOW) )
+    {
+        pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
+        pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2;
+        pf->mc_luma = mc_luma_sse2;
+        pf->get_ref = get_ref_sse2;
+    }
 }
Index: tools/checkasm.c
===================================================================
--- tools/checkasm.c (revision 745)
+++ tools/checkasm.c (working copy)
@@ -407,8 +407,8 @@
     uint8_t *src = &buf1[2*32+2];
     uint8_t *src2[4] = { &buf1[2*32+2], &buf1[6*32+2],
                          &buf1[10*32+2], &buf1[14*32+2] };
-    uint8_t *dst1 = &buf3[2*32+2];
-    uint8_t *dst2 = &buf4[2*32+2];
+    uint8_t *dst1 = &buf3[2*32];
+    uint8_t *dst2 = &buf4[2*32];
 
     int dx, dy, i, j, w;
     int ret = 0, ok, used_asm;
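
For reference, below is a minimal C sketch of the width-table dispatch that mc_luma/get_ref use and that this patch extends. The scalar avg2_c, the AVG2_W wrappers, and the avg_wtab/main harness are illustrative stand-ins, not code from x264; only the i_width>>2 indexing scheme and the pavgb rounding behavior are taken from the patch itself.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Scalar stand-in for the x264_pixel_avg2_w* assembly: for each of
 * i_height rows, average 'width' pixels from src1 and src2 (which share
 * the stride i_src) into dst, rounding up as pavgb does. */
static void avg2_c( uint8_t *dst, int i_dst, uint8_t *src1, int i_src,
                    uint8_t *src2, int i_height, int width )
{
    int x, y;
    for( y = 0; y < i_height; y++ )
        for( x = 0; x < width; x++ )
            dst[y*i_dst+x] = ( src1[y*i_src+x] + src2[y*i_src+x] + 1 ) >> 1;
}

#define AVG2_W(W) \
static void avg2_w##W( uint8_t *dst, int i_dst, uint8_t *src1, \
                       int i_src, uint8_t *src2, int i_height ) \
{ avg2_c( dst, i_dst, src1, i_src, src2, i_height, W ); }

AVG2_W(4) AVG2_W(8) AVG2_W(12) AVG2_W(16) AVG2_W(20)

/* i_width>>2 maps the block widths {4,8,12,16,20} onto slots {1,2,3,4,5}.
 * Before this patch, slot 3 (width 12) reused the w16 routine and thus
 * wrote 4 columns past the block; the new w12 entry avoids that. */
static void (* const avg_wtab[6])( uint8_t *, int, uint8_t *, int,
                                   uint8_t *, int ) =
{ NULL, avg2_w4, avg2_w8, avg2_w12, avg2_w16, avg2_w20 };

int main( void )
{
    uint8_t src1[20*4], src2[20*4], dst[20*4];
    int i_width = 12, i_height = 4;
    memset( src1, 100, sizeof(src1) );
    memset( src2, 103, sizeof(src2) );
    memset( dst, 0, sizeof(dst) );
    avg_wtab[i_width>>2]( dst, 20, src1, 20, src2, i_height );
    /* (100+103+1)>>1 = 102 inside the 12-wide block; column 12 untouched */
    printf( "dst[0]=%d dst[11]=%d dst[12]=%d\n", dst[0], dst[11], dst[12] );
    return 0;
}

The same indexing explains the split in x264_pixel_avg_wtab_sse2: widths 4 through 12 keep the mmxext routines (presumably too narrow to benefit from 16-byte vectors), while 16 and 20 get the new SSE2 versions. The checkasm change moves dst1/dst2 to a 16-byte-aligned offset so the movdqa stores in those versions remain legal.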