[vlc-devel] [PATCH 14/25] packetizer: add startcode_FindAnnexB AVX2 asm
Victorien Le Couviour--Tuffet
victorien.lecouviour.tuffet at gmail.com
Tue Apr 14 12:40:25 CEST 2020
---
modules/packetizer/startcode.asm | 103 ++++++++++++++++++++++++++
modules/packetizer/startcode_helper.h | 9 +++
test/modules/packetizer/helpers.c | 10 +++
3 files changed, 122 insertions(+)
diff --git a/modules/packetizer/startcode.asm b/modules/packetizer/startcode.asm
index cac5eea116..a74463dc1b 100644
--- a/modules/packetizer/startcode.asm
+++ b/modules/packetizer/startcode.asm
@@ -171,3 +171,106 @@ DEFINE_ARGS ptr, _, size, tmp
add r3d, r4d
lea rax, [ptrq+r3q]
RET
+
+%if ARCH_X86_64 ; this routine is only assembled for x86-64 targets
+
+INIT_YMM avx2 ; mN names refer to 256-bit ymm registers from here on
+cglobal startcode_FindAnnexB, 2, 7, 5, ptr, end, size ; (ptr, end) -> address of the first 00 00 01 byte sequence in [ptr, end), or NULL if there is none
+%define base r6q-shuf_pad2dw ; constants are addressed relative to r6q
+ lea r6q, [shuf_pad2dw] ; r6q = address of the shuf_pad2dw table
+ vbroadcasti128 m0, [base+shuf_pad2dw] ; m0 = pshufb control in both lanes (presumably expands 3-byte groups into dwords; table is defined outside this hunk)
+ vbroadcasti128 m1, [base+pd_0x00010000] ; m1 = dword pattern to match (0x00010000 per its name, i.e. bytes 00 00 01 00 - confirm against the definition)
+ mov sizeq, endq
+ sub sizeq, ptrq ; size = end - ptr, kept as a 64-bit byte count throughout
+ cmp sizeq, 28 ; one 256-bit iteration reads 28 bytes (16 at ptr+12)
+ jl .xmm
+
+.loop:
+ movu xm2, [ptrq+ 0] ; low lane = 16 bytes at ptr
+ vinserti128 m2, [ptrq+12], 1 ; high lane = 16 bytes at ptr+12
+ psrldq m3, m2, 1 ; m3 = each lane shifted down by 1 byte
+ psrldq m4, m2, 2 ; m4 = each lane shifted down by 2 bytes
+ pshufb m2, m0 ; rearrange the bytes of each alignment with the m0 control
+ pshufb m3, m0
+ pshufb m4, m0
+ pcmpeqd m2, m1 ; dword becomes all-ones where it equals the pattern
+ pcmpeqd m3, m1
+ pcmpeqd m4, m1
+ movmskps r3d, m2 ; one mask bit per dword (8 bits for ymm)
+ movmskps r4d, m3
+ movmskps r5d, m4
+ tzcnt r3d, r3d ; index of the first matching dword, 32 when the mask is empty
+ tzcnt r4d, r4d
+ tzcnt r5d, r5d
+ cmp r4d, r3d
+ cmovle r3d, r4d ; r3d = min(index at shift 0, index at shift 1)
+ setle r4b ; r4d = 1 if the 1-byte-shifted copy won (or tied), else 0
+ cmp r5d, r3d
+ cmovle r3d, r5d ; r3d = min over all three alignments
+ setle r5b ; r5d = 1 if the 2-byte-shifted copy won (or tied), else 0
+ test r3d, 32 ; bit 5 is only set when none of the masks had a match
+ jz .found
+ add ptrq, 24 ; no match: advance 24 bytes (the two lanes start 12 bytes apart)
+ sub sizeq, 24 ; 64-bit update: a 32-bit write would truncate spans of 2 GiB or more
+ cmp sizeq, 28
+ jge .loop
+
+INIT_XMM avx2 ; mN names refer to 128-bit xmm registers from here on
+.xmm:
+ cmp sizeq, 16 ; a single 128-bit pass reads 16 bytes
+ jl .end
+ movu m2, [ptrq] ; same search as .loop on one 16-byte vector
+ psrldq m3, m2, 1
+ psrldq m4, m2, 2
+ pshufb m2, m0
+ pshufb m3, m0
+ pshufb m4, m0
+ pcmpeqd m2, m1
+ pcmpeqd m3, m1
+ pcmpeqd m4, m1
+ movmskps r3d, m2 ; 4 mask bits for xmm
+ movmskps r4d, m3
+ movmskps r5d, m4
+ tzcnt r3d, r3d ; 0..3, or 32 when there is no match
+ tzcnt r4d, r4d
+ tzcnt r5d, r5d
+ cmp r4d, r3d
+ cmovle r3d, r4d
+ setle r4b
+ cmp r5d, r3d
+ cmovle r3d, r5d
+ setle r5b
+ test r3d, 32
+ jnz .end ; no match: let the scalar loop scan from ptr (ptr and size are left unchanged)
+
+.found:
+ lea r3d, [r3d*3] ; dword index -> byte offset (3 bytes per index, presumably one 3-byte group per dword)
+ mov r6d, r4d ; r6 is free to clobber: the constants are already loaded
+ xor r4d, r5d
+ and r4d, r6d ; r4d = r4 & ~r5: 1 only when shift 1 won and shift 2 did not
+ shl r5d, 1 ; r5d = 2 when shift 2 won
+ or r4d, r5d ; r4d = byte shift (0, 1 or 2) of the winning alignment
+ add r3d, r4d
+ lea rax, [ptrq+r3q] ; return ptr + index*3 + shift
+ RET
+
+DEFINE_ARGS ptr, _, size, tmp ; rename r3 to tmp for the scalar tail
+.end:
+ sub sizeq, 3 ; size = number of remaining 3-byte windows minus one
+ jl .ret_null ; fewer than 3 bytes left: nothing can match
+.end_loop:
+ xor tmpd, tmpd
+ test word [ptrq], 0xFFFF ; ZF set when bytes 0 and 1 are both zero
+ cmovz tmpw, [ptrq+1] ; if so, tmp = bytes 1..2 as a little-endian word
+ xor tmpd, 0x0100 ; zero only when bytes 1..2 are 00 01
+ jz .ret ; 00 00 01 found at ptr
+ inc ptrq
+ dec sizeq
+ jge .end_loop
+.ret_null:
+ xor ptrq, ptrq ; NULL: no start code in the range
+.ret:
+ mov rax, ptrq
+ RET
+
+%endif
diff --git a/modules/packetizer/startcode_helper.h b/modules/packetizer/startcode_helper.h
index 73425e9970..d325aca7d7 100644
--- a/modules/packetizer/startcode_helper.h
+++ b/modules/packetizer/startcode_helper.h
@@ -72,12 +72,21 @@ uint8_t const *vlcpriv_startcode_FindAnnexB_sse2(uint8_t const *ptr,
uint8_t const *end);
uint8_t const *vlcpriv_startcode_FindAnnexB_ssse3(uint8_t const *ptr,
uint8_t const *end);
+# ifdef __x86_64__ /* the AVX2 routine is only assembled when ARCH_X86_64 is set */
+uint8_t const *vlcpriv_startcode_FindAnnexB_avx2(uint8_t const *ptr,
+ uint8_t const *end); /* AVX2 asm: first 00 00 01 in [ptr, end), or NULL */
+# endif /* __x86_64__ */
#endif
static inline block_startcode_helper_t
startcode_FindAnnexB_helper(void)
{
#if defined(__i386__) || defined(__x86_64__)
+# ifdef __x86_64__
+ if (vlc_CPU_AVX2())
+ return vlcpriv_startcode_FindAnnexB_avx2;
+ else
+# endif
if (vlc_CPU_SSSE3())
return vlcpriv_startcode_FindAnnexB_ssse3;
else if (vlc_CPU_SSE2())
diff --git a/test/modules/packetizer/helpers.c b/test/modules/packetizer/helpers.c
index 64e982c2d5..ebca21ae22 100644
--- a/test/modules/packetizer/helpers.c
+++ b/test/modules/packetizer/helpers.c
@@ -94,6 +94,16 @@ static int run_annexb_sets( const uint8_t *p_set, const uint8_t *p_end,
if( i_ret != 0 )
return i_ret;
}
+# ifdef __x86_64__
+ if (vlc_CPU_AVX2())
+ {
+ printf("checking AVX2 asm:\n");
+ i_ret = check_set( p_set, p_end, p_results, i_results, i_results_offset,
+ vlcpriv_startcode_FindAnnexB_avx2 );
+ if( i_ret != 0 )
+ return i_ret;
+ }
+# endif
#endif
return 0;
--
2.24.1
More information about the vlc-devel
mailing list