[vlc-devel] [PATCH 5/19] packetizer/startcode_helper: enhance with AVX2
Lyndon Brown
jnqnfe at gmail.com
Thu Sep 24 21:36:14 CEST 2020
From: Lyndon Brown <jnqnfe at gmail.com>
Date: Fri, 25 Jan 2019 06:05:54 +0000
Subject: packetizer/startcode_helper: enhance with AVX2
diff --git a/modules/packetizer/startcode_helper.h b/modules/packetizer/startcode_helper.h
index 41ba9b2fbf..bd19855a48 100644
--- a/modules/packetizer/startcode_helper.h
+++ b/modules/packetizer/startcode_helper.h
@@ -1,7 +1,7 @@
/*****************************************************************************
* startcode_helper.h: Startcodes helpers
*****************************************************************************
- * Copyright (C) 2016, 2020 VideoLAN Authors
+ * Copyright (C) 2016, 2019, 2020 VideoLAN Authors
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by
@@ -22,7 +22,7 @@
#include <vlc_cpu.h>
-#ifdef CAN_COMPILE_SSE2
+#if defined(CAN_COMPILE_SSE2) || defined(CAN_COMPILE_AVX2)
# ifdef __has_attribute
# if __has_attribute(__vector_size__)
# define HAS_ATTRIBUTE_VECTORSIZE
@@ -30,7 +30,12 @@
# endif
# ifdef HAS_ATTRIBUTE_VECTORSIZE
- typedef unsigned char v16qu __attribute__((__vector_size__(16)));
+# ifdef CAN_COMPILE_SSE2
+ typedef unsigned char v16qu __attribute__((__vector_size__(16)));
+# endif
+# ifdef CAN_COMPILE_AVX2
+ typedef unsigned char v32qu __attribute__((__vector_size__(32)));
+# endif
# endif
#endif
@@ -52,6 +57,80 @@
}\
}
+#ifdef CAN_COMPILE_AVX2
+
+/**
+ * Searches [p, end) for the first 00 00 01 start code, using AVX2.
+ *
+ * The buffer is processed in three phases: a scalar prologue up to the next
+ * 32-byte boundary, an aligned 32-bytes-per-iteration vector loop that only
+ * locates zero bytes and delegates the actual matching to TRY_MATCH, and a
+ * scalar epilogue for the remaining bytes.
+ *
+ * \param p   first byte of the buffer to search
+ * \param end one past the last byte of the buffer
+ * \return pointer to the first byte of the start code, or NULL if none found
+ */
+__attribute__ ((__target__ ("avx2")))
+static inline const uint8_t * startcode_FindAnnexB_AVX2( const uint8_t *p, const uint8_t *end )
+{
+    /* Fewer than 3 bytes cannot hold a start code. Bailing out here also
+     * avoids forming a pointer before the buffer with "end -= 3" below. */
+    if( end - p < 3 )
+        return NULL;
+
+    /* First align to 32. From here on, end is the last position at which
+     * a 3-byte start code can still begin. */
+    const uint8_t *alignedend = p + 32 - ((intptr_t)p & 31);
+    for (end -= 3; p < alignedend && p <= end; p++) {
+        if (p[0] == 0 && p[1] == 0 && p[2] == 1)
+            return p;
+    }
+
+    /* Only stop when every candidate position has been examined. The
+     * prologue can also stop with p == end (boundary reached exactly at the
+     * last candidate); that position is still unchecked and is handled by
+     * the epilogue below. */
+    if( p > end )
+        return NULL;
+
+    /* Last 32-byte boundary at or below end: the vector loop never loads
+     * past it. */
+    alignedend = end - ((intptr_t) end & 31);
+    if( alignedend > p )
+    {
+# ifdef HAS_ATTRIBUTE_VECTORSIZE
+        const v32qu zeros = { 0 };
+# endif
+
+        for( ; p < alignedend; p += 32)
+        {
+            /* Bit i of match is set when byte p[i] is zero. */
+            uint32_t match;
+# ifdef HAS_ATTRIBUTE_VECTORSIZE
+            __asm__ volatile(
+                "vmovdqa   0(%[v]),   %%ymm0\n"
+                "vpcmpeqb  %[czero],  %%ymm0, %%ymm0\n"
+                "vpmovmskb %%ymm0,    %[match]\n" /* mask will be in reversed match order */
+                : [match]"=r"(match)
+                : [v]"r"(p), [czero]"x"(zeros)
+                : "ymm0"
+            );
+# else
+            /* No vector type available: build the zero vector in ymm1. */
+            __asm__ volatile(
+                "vmovdqa   0(%[v]),   %%ymm0\n"
+                "vpxor     %%ymm1,    %%ymm1, %%ymm1\n"
+                "vpcmpeqb  %%ymm1,    %%ymm0, %%ymm0\n"
+                "vpmovmskb %%ymm0,    %[match]\n" /* mask will be in reversed match order */
+                : [match]"=r"(match)
+                : [v]"r"(p)
+                : "ymm0", "ymm1"
+            );
+# endif
+            /* Each mask nibble covers 4 consecutive bytes; only groups that
+             * contain a zero byte are handed to TRY_MATCH (defined earlier in
+             * this header; presumably it returns from this function when it
+             * finds a start code in that group). */
+            if( match & 0x0000000F )
+                TRY_MATCH(p, 0);
+            if( match & 0x000000F0 )
+                TRY_MATCH(p, 4);
+            if( match & 0x00000F00 )
+                TRY_MATCH(p, 8);
+            if( match & 0x0000F000 )
+                TRY_MATCH(p, 12);
+            if( match & 0x000F0000 )
+                TRY_MATCH(p, 16);
+            if( match & 0x00F00000 )
+                TRY_MATCH(p, 20);
+            if( match & 0x0F000000 )
+                TRY_MATCH(p, 24);
+            if( match & 0xF0000000 )
+                TRY_MATCH(p, 28);
+        }
+    }
+
+    /* Scalar epilogue for whatever is left after the last aligned chunk. */
+    for (; p <= end; p++) {
+        if (p[0] == 0 && p[1] == 0 && p[2] == 1)
+            return p;
+    }
+
+    return NULL;
+}
+
+#endif
+
#ifdef CAN_COMPILE_SSE2
__attribute__ ((__target__ ("sse2")))
@@ -152,6 +231,10 @@ static inline const uint8_t * startcode_FindAnnexB_Bits( const uint8_t *p, const
static inline const uint8_t * startcode_FindAnnexB( const uint8_t *p, const uint8_t *end )
{
+#ifdef CAN_COMPILE_AVX2
+ if (vlc_CPU_AVX2())
+ return startcode_FindAnnexB_AVX2(p, end);
+#endif
#ifdef CAN_COMPILE_SSE2
if (vlc_CPU_SSE2())
return startcode_FindAnnexB_SSE2(p, end);
diff --git a/test/modules/packetizer/helpers.c b/test/modules/packetizer/helpers.c
index 475644d2ad..957298d630 100644
--- a/test/modules/packetizer/helpers.c
+++ b/test/modules/packetizer/helpers.c
@@ -93,6 +93,21 @@ static int run_annexb_sets( const uint8_t *p_set, const uint8_t *p_end,
printf("sse2 simd asm not built in, skipping test:\n");
#endif
+#ifdef CAN_COMPILE_AVX2
+ if (vlc_CPU_AVX2())
+ {
+ printf("checking avx2 simd asm:\n");
+ i_ret = check_set( p_set, p_end, p_results, i_results, i_results_offset,
+ startcode_FindAnnexB_AVX2 );
+ if( i_ret != 0 )
+ return i_ret;
+ }
+ else
+ printf("CPU does not support avx2 simd, skipping test:\n");
+#else
+ printf("avx2 simd asm not built in, skipping test:\n");
+#endif
+
return 0;
}
More information about the vlc-devel
mailing list