[x264-devel] x86: AVX memzero_aligned
Henrik Gramner
git at videolan.org
Tue Apr 23 23:37:10 CEST 2013
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Tue Apr 16 23:27:29 2013 +0200| [255271fd7999b6b7ff7d65b7b8de1a2dc8919b1a] | committer: Jason Garrett-Glaser
x86: AVX memzero_aligned
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=255271fd7999b6b7ff7d65b7b8de1a2dc8919b1a
---
common/common.h | 4 ++--
common/x86/mc-a2.asm | 14 +++++++-------
common/x86/mc-c.c | 2 ++
encoder/me.c | 2 +-
4 files changed, 12 insertions(+), 10 deletions(-)
diff --git a/common/common.h b/common/common.h
index 53a6ff0..1732d59 100644
--- a/common/common.h
+++ b/common/common.h
@@ -770,8 +770,8 @@ struct x264_t
ALIGNED_16( dctcoef fenc_dct4[16][16] );
/* Psy RD SATD/SA8D scores cache */
- ALIGNED_16( uint64_t fenc_hadamard_cache[9] );
- ALIGNED_16( uint32_t fenc_satd_cache[32] );
+ ALIGNED_N( uint64_t fenc_hadamard_cache[9] );
+ ALIGNED_N( uint32_t fenc_satd_cache[32] );
/* pointer over mb of the frame to be compressed */
pixel *p_fenc[3]; /* y,u,v */
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index 5fbe9c1..f1f09d1 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -1223,7 +1223,7 @@ MEMCPY
;-----------------------------------------------------------------------------
; void *memzero_aligned( void *dst, size_t n );
;-----------------------------------------------------------------------------
-%macro MEMZERO 0
+%macro MEMZERO 1
cglobal memzero_aligned, 2,2
add r0, r1
neg r1
@@ -1234,21 +1234,21 @@ cglobal memzero_aligned, 2,2
%endif
.loop:
%assign i 0
-%rep 8
+%rep %1
mova [r0 + r1 + i], m0
%assign i i+mmsize
%endrep
- add r1, mmsize*8
+ add r1, mmsize*%1
jl .loop
RET
%endmacro
INIT_MMX mmx
-MEMZERO
+MEMZERO 8
INIT_XMM sse
-MEMZERO
-
-
+MEMZERO 8
+INIT_YMM avx
+MEMZERO 4
%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index 64c11a6..8c12deb 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -129,6 +129,7 @@ void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n );
void *x264_memcpy_aligned_sse( void *dst, const void *src, size_t n );
void x264_memzero_aligned_mmx( void *dst, size_t n );
void x264_memzero_aligned_sse( void *dst, size_t n );
+void x264_memzero_aligned_avx( void *dst, size_t n );
void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride );
void x264_integral_init4h_avx2( uint16_t *sum, uint8_t *pix, intptr_t stride );
void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride );
@@ -798,6 +799,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
if( !(cpu&X264_CPU_AVX) )
return;
+ pf->memzero_aligned = x264_memzero_aligned_avx;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx;
if( cpu&X264_CPU_FMA4 )
diff --git a/encoder/me.c b/encoder/me.c
index 6db1a28..2dc9b56 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -1058,7 +1058,7 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
uint64_t bcostrd = COST_MAX64;
uint16_t amvd;
/* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */
- ALIGNED_ARRAY_16( uint8_t, visited,[8],[8][8] );
+ ALIGNED_ARRAY_N( uint8_t, visited,[8],[8][8] );
/* all permutations of an offset in up to 2 of the dimensions */
ALIGNED_4( static const int8_t dia4d[33][4] ) =
{
More information about the x264-devel mailing list