[x264-devel] x86: AVX2 nal_escape
Henrik Gramner
git at videolan.org
Tue Apr 23 23:37:10 CEST 2013
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Tue Apr 16 23:27:32 2013 +0200| [c3711285a6dd1343197ac3e53bb95acf99c6cb42] | committer: Jason Garrett-Glaser
x86: AVX2 nal_escape
Also rewrite the entire function to be faster, and drop the AVX version, which is no longer useful.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=c3711285a6dd1343197ac3e53bb95acf99c6cb42
---
common/bitstream.c | 9 ++--
common/x86/bitstream-a.asm | 111 +++++++++++++++++++++++---------------------
encoder/encoder.c | 8 ++--
x264.h | 2 +-
4 files changed, 69 insertions(+), 61 deletions(-)
diff --git a/common/bitstream.c b/common/bitstream.c
index 72a24ef..8f49a47 100644
--- a/common/bitstream.c
+++ b/common/bitstream.c
@@ -41,7 +41,7 @@ static uint8_t *x264_nal_escape_c( uint8_t *dst, uint8_t *src, uint8_t *end )
uint8_t *x264_nal_escape_mmx2( uint8_t *dst, uint8_t *src, uint8_t *end );
uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end );
-uint8_t *x264_nal_escape_avx( uint8_t *dst, uint8_t *src, uint8_t *end );
+uint8_t *x264_nal_escape_avx2( uint8_t *dst, uint8_t *src, uint8_t *end );
void x264_cabac_block_residual_rd_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_rd_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_rd_internal_ssse3 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
@@ -132,8 +132,11 @@ void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf )
pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_ssse3_lzcnt;
}
}
+
+ if( cpu&X264_CPU_AVX2 )
+ {
+ pf->nal_escape = x264_nal_escape_avx2;
+ }
#endif
- if( cpu&X264_CPU_AVX )
- pf->nal_escape = x264_nal_escape_avx;
#endif
}
diff --git a/common/x86/bitstream-a.asm b/common/x86/bitstream-a.asm
index 16061f2..2e6f096 100644
--- a/common/x86/bitstream-a.asm
+++ b/common/x86/bitstream-a.asm
@@ -4,7 +4,7 @@
;* Copyright (C) 2010-2013 x264 project
;*
;* Authors: Jason Garrett-Glaser <darkshikari at gmail.com>
-;* Henrik Gramner <hengar-6 at student.ltu.se>
+;* Henrik Gramner <henrik at gramner.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
@@ -32,100 +32,105 @@ SECTION .text
;-----------------------------------------------------------------------------
; uint8_t *x264_nal_escape( uint8_t *dst, uint8_t *src, uint8_t *end )
;-----------------------------------------------------------------------------
-
%macro NAL_LOOP 2
-%1_escape:
+%%escape:
; Detect false positive to avoid unneccessary escape loop
xor r3d, r3d
cmp byte [r0+r1-1], 0
setnz r3b
- xor r3d, r4d
+ xor k3, k4
jnz .escape
- jmp %1_continue
+ jmp %%continue
ALIGN 16
%1:
- pcmpeqb m3, m1, m4
- pcmpeqb m2, m0, m4
- pmovmskb r3d, m3
- %2 [r0+r1], m0
+ mova [r0+r1+mmsize], m1
+ pcmpeqb m1, m0
+ mova [r0+r1], m2
+ pcmpeqb m2, m0
+ pmovmskb r3d, m1
+ %2 m1, [r1+r2+3*mmsize]
pmovmskb r4d, m2
- shl r3d, mmsize
- mova m0, [r1+r2+2*mmsize]
- or r4d, r3d
- %2 [r0+r1+mmsize], m1
- lea r3d, [r4+r4+1]
- mova m1, [r1+r2+3*mmsize]
- and r4d, r3d
- jnz %1_escape
-%1_continue:
+ %2 m2, [r1+r2+2*mmsize]
+ shl k3, mmsize
+ or k3, k4
+ lea k4, [2*r3+1]
+ and k4, k3
+ jnz %%escape
+%%continue:
add r1, 2*mmsize
jl %1
%endmacro
%macro NAL_ESCAPE 0
+%if mmsize == 32
+ %xdefine k3 r3
+ %xdefine k4 r4
+%else
+ %xdefine k3 r3d
+ %xdefine k4 r4d
+%endif
cglobal nal_escape, 3,5
- mov r3w, [r1]
+ movzx r3d, byte [r1]
sub r1, r2 ; r1 = offset of current src pointer from end of src
- pxor m4, m4
+ pxor m0, m0
+ mov [r0], r3b
sub r0, r1 ; r0 = projected end of dst, assuming no more escapes
- mov [r0+r1], r3w
- add r1, 2
- jge .ret
+ or r3d, 0xffffff00 ; ignore data before src
- ; Start off by jumping into the escape loop in
- ; case there's an escape at the start.
- ; And do a few more in scalar until src is aligned again.
- jmp .first_escape
+ ; Start off by jumping into the escape loop in case there's an escape at the start.
+ ; And do a few more in scalar until dst is aligned.
+ jmp .escape_loop
+%if mmsize == 16
NAL_LOOP .loop_aligned, mova
-%if mmsize==16
jmp .ret
- NAL_LOOP .loop_unaligned, movu
%endif
+ NAL_LOOP .loop_unaligned, movu
.ret:
movifnidn rax, r0
RET
-ALIGN 16
.escape:
; Skip bytes that are known to be valid
- and r4d, r3d
- tzcnt r3d, r4d
- add r1, r3
+ and k4, k3
+ tzcnt k4, k4
+ xor r3d, r3d ; the last two bytes are known to be zero
+ add r1, r4
.escape_loop:
inc r1
jge .ret
-.first_escape:
- movzx r3d, byte [r1+r2]
- lea r4, [r1+r2]
- cmp r3d, 3
- jna .escape_check
-.no_escape:
+ movzx r4d, byte [r1+r2]
+ shl r3d, 8
+ or r3d, r4d
+ test r3d, 0xfffffc ; if the last two bytes are 0 and the current byte is <=3
+ jz .add_escape_byte
+.escaped:
+ lea r4d, [r0+r1]
mov [r0+r1], r3b
- test r4d, mmsize-1 ; Do SIMD when src is aligned
+ test r4d, mmsize-1 ; Do SIMD when dst is aligned
jnz .escape_loop
- mova m0, [r4]
- mova m1, [r4+mmsize]
-%if mmsize==16
- lea r4d, [r0+r1]
+ movu m1, [r1+r2+mmsize]
+ movu m2, [r1+r2]
+%if mmsize == 16
+ lea r4d, [r1+r2]
test r4d, mmsize-1
- jnz .loop_unaligned
+ jz .loop_aligned
%endif
- jmp .loop_aligned
+ jmp .loop_unaligned
-ALIGN 16
-.escape_check:
- cmp word [r0+r1-2], 0
- jnz .no_escape
+.add_escape_byte:
mov byte [r0+r1], 3
- inc r0
- jmp .no_escape
+ inc r0
+ or r3d, 0x0300
+ jmp .escaped
%endmacro
INIT_MMX mmx2
NAL_ESCAPE
INIT_XMM sse2
NAL_ESCAPE
-INIT_XMM avx
+%if ARCH_X86_64
+INIT_YMM avx2
NAL_ESCAPE
+%endif
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 4214255..2e59bf5 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -1377,7 +1377,7 @@ x264_t *x264_encoder_open( x264_param_t *param )
* ( h->param.rc.i_rc_method == X264_RC_ABR ? pow( 0.95, h->param.rc.i_qp_min )
: pow( 0.95, h->param.rc.i_qp_constant ) * X264_MAX( 1, h->param.rc.f_ip_factor )));
- h->nal_buffer_size = h->out.i_bitstream * 3/2 + 4;
+ h->nal_buffer_size = h->out.i_bitstream * 3/2 + 4 + 64; /* +4 for startcode, +64 for nal_escape assembly padding */
CHECKED_MALLOC( h->nal_buffer, h->nal_buffer_size );
if( h->param.i_threads > 1 &&
@@ -1625,9 +1625,9 @@ static int x264_nal_end( x264_t *h )
x264_nal_t *nal = &h->out.nal[h->out.i_nal];
uint8_t *end = &h->out.p_bitstream[bs_pos( &h->out.bs ) / 8];
nal->i_payload = end - nal->p_payload;
- /* nal_escape_mmx reads past the end of the input.
+ /* Assembly implementation of nal_escape reads past the end of the input.
* While undefined padding wouldn't actually affect the output, it makes valgrind unhappy. */
- memset( end, 0xff, 32 );
+ memset( end, 0xff, 64 );
if( h->param.nalu_process )
h->param.nalu_process( h, nal, h->fenc->opaque );
h->out.i_nal++;
@@ -1653,7 +1653,7 @@ static int x264_encoder_encapsulate_nals( x264_t *h, int start )
nal_size += h->out.nal[i].i_payload;
/* Worst-case NAL unit escaping: reallocate the buffer if it's too small. */
- int necessary_size = nal_size * 3/2 + h->out.i_nal * 4;
+ int necessary_size = nal_size * 3/2 + h->out.i_nal * 4 + 4 + 64;
if( h->nal_buffer_size < necessary_size )
{
h->nal_buffer_size = necessary_size * 2;
diff --git a/x264.h b/x264.h
index 43cb838..3cd1f67 100644
--- a/x264.h
+++ b/x264.h
@@ -499,7 +499,7 @@ typedef struct x264_param_t
* is done encoding.
*
* This callback MUST do the following in order to work correctly:
- * 1) Have available an output buffer of at least size nal->i_payload*3/2 + 5 + 16.
+ * 1) Have available an output buffer of at least size nal->i_payload*3/2 + 5 + 64.
* 2) Call x264_nal_encode( h, dst, nal ), where dst is the output buffer.
* After these steps, the content of nal is valid and can be used in the same way as if
* the NAL unit were output by x264_encoder_encode.
More information about the x264-devel
mailing list