[vlc-commits] packetizer: add SSE2 based AnnexB startcode helper
Francois Cartegnie
git at videolan.org
Tue Jan 5 21:42:25 CET 2016
vlc | branch: master | Francois Cartegnie <fcvlcdev at free.fr> | Tue Jan 5 17:39:28 2016 +0100| [90c07084b39082209d778a918a612f29f3a1cd83] | committer: Francois Cartegnie
packetizer: add SSE2 based AnnexB startcode helper
Improves performance even further over the previous commit (by ~2x on 4K)
> http://git.videolan.org/gitweb.cgi/vlc.git/?a=commit;h=90c07084b39082209d778a918a612f29f3a1cd83
---
modules/packetizer/startcode_helper.h | 111 ++++++++++++++++++++++++++++-----
1 file changed, 96 insertions(+), 15 deletions(-)
diff --git a/modules/packetizer/startcode_helper.h b/modules/packetizer/startcode_helper.h
index 9db67fd..a126480 100644
--- a/modules/packetizer/startcode_helper.h
+++ b/modules/packetizer/startcode_helper.h
@@ -20,15 +20,105 @@
#ifndef _STARTCODE_HELPER_H
#define _STARTCODE_HELPER_H 1
+#include <vlc_cpu.h>
+
+#if !defined(CAN_COMPILE_SSE2) && defined(HAVE_SSE2_INTRINSICS)
+ #include <emmintrin.h>
+#endif
+
/* Looks up efficiently for an AnnexB startcode 0x00 0x00 0x01
- * by using a 4 times faster trick than single byte lookup.
- *
- * That code is adapted from libav's ff_avc_find_startcode_internal
+ * by using a 4 times faster trick than single byte lookup. */
+
+#define TRY_MATCH(p,a) {\
+ if (p[a+1] == 0) {\
+ if (p[a+0] == 0 && p[a+2] == 1)\
+ return a+p;\
+ if (p[a+2] == 0 && p[a+3] == 1)\
+ return a+p+1;\
+ }\
+ if (p[a+3] == 0) {\
+ if (p[a+2] == 0 && p[a+4] == 1)\
+ return a+p+2;\
+ if (p[a+4] == 0 && p[a+5] == 1)\
+ return a+p+3;\
+ }\
+ }
+
+#if defined(CAN_COMPILE_SSE2) || defined(HAVE_SSE2_INTRINSICS)
+
+/* SSE2 accelerated search for the first AnnexB startcode (0x00 0x00 0x01)
+ * in the byte range [p, end).
+ *   p:   first byte to examine
+ *   end: one past the last byte of the range
+ * Returns a pointer to the first byte of the startcode, or NULL when the
+ * range holds no complete startcode. */
+__attribute__ ((__target__ ("sse2")))
+static inline const uint8_t * startcode_FindAnnexB_SSE2( const uint8_t *p, const uint8_t *end )
+{
+    /* A startcode is 3 bytes long: nothing to find in a shorter range.
+     * This also keeps "end - 3" below from pointing before the buffer. */
+    if( end - p < 3 )
+        return NULL;
+
+    /* From now on, end is the LAST position a startcode can begin at */
+    end -= 3;
+
+    /* First align to 16 */
+    /* Skipping this step and doing unaligned loads isn't faster */
+    const uint8_t *alignedend = p + 16 - ((intptr_t)p & 15);
+    for (; p < alignedend && p <= end; p++) {
+        if (p[0] == 0 && p[1] == 0 && p[2] == 1)
+            return p;
+    }
+
+    /* Every candidate position has already been checked */
+    if( p > end )
+        return NULL;
+
+    /* p is now 16 bytes aligned. Process whole aligned blocks; TRY_MATCH
+     * reads up to 2 bytes past a block, which stays inside the range
+     * because blocks stop at or before end (3 bytes before the real end). */
+    alignedend = end - ((intptr_t) end & 15);
+    if( alignedend > p )
+    {
+#ifndef CAN_COMPILE_SSE2
+        const __m128i zeros = _mm_set1_epi8( 0x00 );
+#endif
+        for( ; p < alignedend; p += 16)
+        {
+            uint32_t match;
+#ifdef CAN_COMPILE_SSE2
+            /* xmm1 is zeroed inside the same asm statement: register
+             * contents are not guaranteed to survive between separate
+             * asm statements */
+            asm volatile(
+                "pxor %%xmm1, %%xmm1\n"
+                "movdqa 0(%[v]), %%xmm0\n"
+                "pcmpeqb %%xmm1, %%xmm0\n"
+                "pmovmskb %%xmm0, %[match]\n"
+                : [match]"=r"(match)
+                : [v]"r"(p)
+                : "xmm0", "xmm1"
+            );
+#else
+            __m128i v = _mm_load_si128((const __m128i*)p);
+            __m128i res = _mm_cmpeq_epi8( zeros, v );
+            match = _mm_movemask_epi8( res ); /* mask will be in reversed match order */
+#endif
+            /* Each nibble of the mask flags zero bytes within a group of
+             * 4 bytes; only groups holding a zero can start a startcode */
+            if( match & 0x000F )
+                TRY_MATCH(p, 0);
+            if( match & 0x00F0 )
+                TRY_MATCH(p, 4);
+            if( match & 0x0F00 )
+                TRY_MATCH(p, 8);
+            if( match & 0xF000 )
+                TRY_MATCH(p, 12);
+        }
+    }
+
+    /* Remaining bytes, including the very last candidate position */
+    for (; p <= end; p++) {
+        if (p[0] == 0 && p[1] == 0 && p[2] == 1)
+            return p;
+    }
+
+    return NULL;
+}
+
+#endif
+
+/* That code is adapted from libav's ff_avc_find_startcode_internal
* and i believe the trick originated from
* https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
*/
static inline const uint8_t * startcode_FindAnnexB( const uint8_t *p, const uint8_t *end )
{
+#if defined(CAN_COMPILE_SSE2) || defined(HAVE_SSE2_INTRINSICS)
+ if (vlc_CPU_SSE2())
+ return startcode_FindAnnexB_SSE2(p, end);
+#endif
const uint8_t *a = p + 4 - ((intptr_t)p & 3);
for (end -= 3; p < a && p < end; p++) {
@@ -41,18 +131,7 @@ static inline const uint8_t * startcode_FindAnnexB( const uint8_t *p, const uint
if ((x - 0x01010101) & (~x) & 0x80808080)
{
/* matching DW isn't faster */
- if (p[1] == 0) {
- if (p[0] == 0 && p[2] == 1)
- return p;
- if (p[2] == 0 && p[3] == 1)
- return p+1;
- }
- if (p[3] == 0) {
- if (p[2] == 0 && p[4] == 1)
- return p+2;
- if (p[4] == 0 && p[5] == 1)
- return p+3;
- }
+ TRY_MATCH(p, 0);
}
}
@@ -64,4 +143,6 @@ static inline const uint8_t * startcode_FindAnnexB( const uint8_t *p, const uint
return NULL;
}
+#undef TRY_MATCH
+
#endif
More information about the vlc-commits mailing list