[x264-devel] commit: avg_weight_sse2 (Jason Garrett-Glaser)
git version control
git at videolan.org
Sun Jun 8 07:01:38 CEST 2008
x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Fri Jun 6 22:57:33 2008 -0600| [8a74b3fdfd2004dc2788660ba7ace5aba32d013b]
avg_weight_sse2
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=8a74b3fdfd2004dc2788660ba7ace5aba32d013b
---
common/x86/deblock-a.asm | 2 +-
common/x86/mc-a.asm | 105 +++++++++++++++++++++++----------------------
common/x86/mc-c.c | 31 ++++++++++----
3 files changed, 77 insertions(+), 61 deletions(-)
diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm
index 957d03c..2513014 100644
--- a/common/x86/deblock-a.asm
+++ b/common/x86/deblock-a.asm
@@ -200,7 +200,7 @@ SECTION .text
%macro SPLATW 1
%ifidn m0, xmm0
pshuflw %1, %1, 0
- punpcklqdq %1, %1
+ movlhps %1, %1
%else
pshufw %1, %1, 0
%endif
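
The SPLATW change above swaps punpcklqdq for movlhps when broadcasting a word
across an xmm register: with identical source and destination operands the two
produce the same result (the low quadword duplicated into both halves), and
movlhps has a shorter encoding since it takes no operand-size prefix. A rough
SSE2 intrinsics sketch of the equivalent operation (illustrative only; the
helper name is made up, and _mm_unpacklo_epi64, i.e. punpcklqdq, stands in for
movlhps, which has no integer intrinsic):

    #include <emmintrin.h>  /* SSE2 intrinsics */

    /* Broadcast word 0 of x into all eight word lanes,
       as SPLATW does on an xmm register. */
    static __m128i splatw_sse2( __m128i x )
    {
        x = _mm_shufflelo_epi16( x, 0 );  /* pshuflw x, x, 0 */
        x = _mm_unpacklo_epi64( x, x );   /* low qword -> both halves */
        return x;                         /* same as _mm_set1_epi16() */
    }
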
diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
index 3dabe9f..21c7b0d 100644
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -28,8 +28,8 @@ SECTION_RODATA
pw_4: times 4 dw 4
pw_8: times 4 dw 8
-pw_32: times 4 dw 32
-pw_64: times 4 dw 64
+pw_32: times 8 dw 32
+pw_64: times 8 dw 64
sw_64: dd 64
SECTION .text
@@ -483,33 +483,42 @@ COPY_W16_SSE2 x264_mc_copy_w16_aligned_sse2, movdqa
; implicit bipred only:
; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
-%macro BIWEIGHT_4P_MMX 2
- movd mm0, %1
- movd mm1, %2
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- pmullw mm0, mm4
- pmullw mm1, mm5
- paddw mm0, mm1
- paddw mm0, mm6
- psraw mm0, 6
- pmaxsw mm0, mm7
- packuswb mm0, mm0
- movd %1, mm0
+%macro SPLATW 2
+%if regsize==16
+ pshuflw %1, %2, 0
+ movlhps %1, %1
+%else
+ pshufw %1, %2, 0
+%endif
+%endmacro
+
+%macro BIWEIGHT 2
+ movh m0, %1
+ movh m1, %2
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ pmullw m0, m4
+ pmullw m1, m5
+ paddw m0, m1
+ paddw m0, m6
+ psraw m0, 6
+ pmaxsw m0, m7
+ packuswb m0, m0
+ movh %1, m0
%endmacro
-%macro BIWEIGHT_START_MMX 1
+%macro BIWEIGHT_START 1
%ifidn r4m, r4d
- movd mm4, r4m
- pshufw mm4, mm4, 0 ; weight_dst
+ movd m4, r4m
+ SPLATW m4, m4 ; weight_dst
%else
- pshufw mm4, r4m, 0
+ SPLATW m4, r4m
%endif
picgetgot r4
- movq mm5, [pw_64 GLOBAL]
- psubw mm5, mm4 ; weight_src
- movq mm6, [pw_32 GLOBAL] ; rounding
- pxor mm7, mm7
+ mova m5, [pw_64 GLOBAL]
+ psubw m5, m4 ; weight_src
+ mova m6, [pw_32 GLOBAL] ; rounding
+ pxor m7, m7
%if %1
%ifidn r5m, r5d
%define t0 r5d
@@ -524,43 +533,37 @@ COPY_W16_SSE2 x264_mc_copy_w16_aligned_sse2, movdqa
;-----------------------------------------------------------------------------
; int x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src, int, int i_weight, int )
;-----------------------------------------------------------------------------
-cglobal x264_pixel_avg_weight_w16_mmxext, 4,5
- BIWEIGHT_START_MMX 1
- BIWEIGHT_4P_MMX [r0 ], [r2 ]
- BIWEIGHT_4P_MMX [r0+ 4], [r2+ 4]
- BIWEIGHT_4P_MMX [r0+ 8], [r2+ 8]
- BIWEIGHT_4P_MMX [r0+12], [r2+12]
+cglobal x264_pixel_avg_weight_4x4_mmxext, 4,4,1
+ BIWEIGHT_START 0
+ BIWEIGHT [r0 ], [r2 ]
+ BIWEIGHT [r0+r1 ], [r2+r3 ]
+ BIWEIGHT [r0+r1*2], [r2+r3*2]
add r0, r1
add r2, r3
- dec t0
- jg .height_loop
- REP_RET
+ BIWEIGHT [r0+r1*2], [r2+r3*2]
+ RET
-;-----------------------------------------------------------------------------
-; int x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_avg_weight_w8_mmxext, 4,5
- BIWEIGHT_START_MMX 1
- BIWEIGHT_4P_MMX [r0 ], [r2 ]
- BIWEIGHT_4P_MMX [r0+4], [r2+4]
+%macro AVG_WEIGHT 2
+cglobal x264_pixel_avg_weight_w%2_%1, 4,5
+ BIWEIGHT_START 1
+%assign x 0
+%rep %2*2/regsize
+ BIWEIGHT [r0+x], [r2+x]
+%assign x x+regsize/2
+%endrep
add r0, r1
add r2, r3
dec t0
jg .height_loop
REP_RET
+%endmacro
-;-----------------------------------------------------------------------------
-; int x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int )
-;-----------------------------------------------------------------------------
-cglobal x264_pixel_avg_weight_4x4_mmxext, 4,4,1
- BIWEIGHT_START_MMX 0
- BIWEIGHT_4P_MMX [r0 ], [r2 ]
- BIWEIGHT_4P_MMX [r0+r1 ], [r2+r3 ]
- BIWEIGHT_4P_MMX [r0+r1*2], [r2+r3*2]
- add r0, r1
- add r2, r3
- BIWEIGHT_4P_MMX [r0+r1*2], [r2+r3*2]
- RET
+INIT_MMX
+AVG_WEIGHT mmxext, 8
+AVG_WEIGHT mmxext, 16
+INIT_XMM
+AVG_WEIGHT sse2, 8
+AVG_WEIGHT sse2, 16
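
For reference, BIWEIGHT implements the implicit-bipred formula stated at the
top of the hunk (log2_denom = 5, offset = 0, weight1 + weight2 = 64): each
output pixel is (dst*weight_dst + src*(64-weight_dst) + 32) >> 6, clamped to
0..255. Each macro invocation handles regsize/2 pixels (4 in an mm register,
8 in an xmm register), which is why the %rep loop in AVG_WEIGHT issues
%2*2/regsize calls per row, stepping regsize/2 bytes each time. A minimal
scalar sketch of the per-pixel math (the function name is illustrative, not
part of x264):

    #include <stdint.h>

    /* Scalar reference for what BIWEIGHT computes per pixel, assuming
       the constraints stated in the asm: log2_denom = 5, offset = 0,
       weight_dst + weight_src = 64. */
    static inline uint8_t biweight_1p( uint8_t dst, uint8_t src,
                                       int weight_dst )
    {
        int weight_src = 64 - weight_dst;
        int v = ( dst * weight_dst + src * weight_src + 32 ) >> 6;
        /* pmaxsw m0, m7 clamps negatives to 0 (implicit weights can
           be negative); packuswb saturates the high side at 255 */
        if( v < 0 )   v = 0;
        if( v > 255 ) v = 255;
        return (uint8_t)v;
    }
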
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index 4a6194a..4551606 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -46,6 +46,8 @@ extern void x264_mc_copy_w16_sse3( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_aligned_sse2( uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int );
+extern void x264_pixel_avg_weight_w8_sse2( uint8_t *, int, uint8_t *, int, int, int );
+extern void x264_pixel_avg_weight_w16_sse2( uint8_t *, int, uint8_t *, int, int, int );
extern void x264_pixel_avg_weight_w16_mmxext( uint8_t *, int, uint8_t *, int, int, int );
extern void x264_prefetch_fenc_mmxext( uint8_t *, int, uint8_t *, int, int );
extern void x264_prefetch_ref_mmxext( uint8_t *, int, int );
@@ -70,16 +72,22 @@ PIXEL_AVG_WALL(cache64_mmxext)
PIXEL_AVG_WALL(cache64_sse2)
PIXEL_AVG_WALL(sse2)
-#define AVG_WEIGHT(W,H) \
-void x264_pixel_avg_weight_ ## W ## x ## H ## _mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_weight_dst ) \
+#define AVG_WEIGHT(W,H,name) \
+void x264_pixel_avg_weight_ ## W ## x ## H ## _##name( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_weight_dst ) \
{ \
- x264_pixel_avg_weight_w ## W ## _mmxext( dst, i_dst, src, i_src, i_weight_dst, H ); \
+ x264_pixel_avg_weight_w ## W ## _##name( dst, i_dst, src, i_src, i_weight_dst, H ); \
}
-AVG_WEIGHT(16,16)
-AVG_WEIGHT(16,8)
-AVG_WEIGHT(8,16)
-AVG_WEIGHT(8,8)
-AVG_WEIGHT(8,4)
+
+AVG_WEIGHT(16,16,mmxext)
+AVG_WEIGHT(16,8,mmxext)
+AVG_WEIGHT(8,16,mmxext)
+AVG_WEIGHT(8,8,mmxext)
+AVG_WEIGHT(8,4,mmxext)
+AVG_WEIGHT(16,16,sse2)
+AVG_WEIGHT(16,8,sse2)
+AVG_WEIGHT(8,16,sse2)
+AVG_WEIGHT(8,8,sse2)
+AVG_WEIGHT(8,4,sse2)
#define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\
static void (* const x264_pixel_avg_wtab_##instr[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =\
@@ -244,7 +252,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_mmxext;
pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_mmxext;
pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_mmxext;
-
+
pf->avg_weight[PIXEL_16x16] = x264_pixel_avg_weight_16x16_mmxext;
pf->avg_weight[PIXEL_16x8] = x264_pixel_avg_weight_16x8_mmxext;
pf->avg_weight[PIXEL_8x16] = x264_pixel_avg_weight_8x16_mmxext;
@@ -285,6 +293,11 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse2;
pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2;
+ pf->avg_weight[PIXEL_16x16] = x264_pixel_avg_weight_16x16_sse2;
+ pf->avg_weight[PIXEL_16x8] = x264_pixel_avg_weight_16x8_sse2;
+ pf->avg_weight[PIXEL_8x16] = x264_pixel_avg_weight_8x16_sse2;
+ pf->avg_weight[PIXEL_8x8] = x264_pixel_avg_weight_8x8_sse2;
+ pf->avg_weight[PIXEL_8x4] = x264_pixel_avg_weight_8x4_sse2;
pf->hpel_filter = x264_hpel_filter_sse2;
if( cpu&X264_CPU_SSE2_IS_FAST )
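
The reworked AVG_WEIGHT wrapper macro in mc-c.c pastes the instruction-set
name into both the generated wrapper and the callee, so each width/height
pair gets a per-ISA entry point for the avg_weight dispatch table. For
instance, AVG_WEIGHT(16,16,sse2) expands to:

    #include <stdint.h>

    extern void x264_pixel_avg_weight_w16_sse2( uint8_t *, int,
                                                uint8_t *, int, int, int );

    /* Expansion of AVG_WEIGHT(16,16,sse2) per the macro above */
    void x264_pixel_avg_weight_16x16_sse2( uint8_t *dst, int i_dst,
                                           uint8_t *src, int i_src,
                                           int i_weight_dst )
    {
        x264_pixel_avg_weight_w16_sse2( dst, i_dst, src, i_src,
                                        i_weight_dst, 16 );
    }
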