[vlc-devel] [PATCH 5/19] packetizer/startcode_helper: enhance with AVX2

Lyndon Brown jnqnfe at gmail.com
Thu Sep 24 21:36:14 CEST 2020


From: Lyndon Brown <jnqnfe at gmail.com>
Date: Fri, 25 Jan 2019 06:05:54 +0000
Subject: packetizer/startcode_helper: enhance with AVX2

Add an AVX2 implementation of the Annex B start code search alongside the
existing SSE2 one. startcode_FindAnnexB() now dispatches to it at runtime
when vlc_CPU_AVX2() reports support, and the packetizer helpers test is
extended to exercise it.

diff --git a/modules/packetizer/startcode_helper.h b/modules/packetizer/startcode_helper.h
index 41ba9b2fbf..bd19855a48 100644
--- a/modules/packetizer/startcode_helper.h
+++ b/modules/packetizer/startcode_helper.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * startcode_helper.h: Startcodes helpers
  *****************************************************************************
- * Copyright (C) 2016, 2020 VideoLAN Authors
+ * Copyright (C) 2016, 2019, 2020 VideoLAN Authors
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU Lesser General Public License as published by
@@ -22,7 +22,7 @@
 
 #include <vlc_cpu.h>
 
-#ifdef CAN_COMPILE_SSE2
+#if defined(CAN_COMPILE_SSE2) || defined(CAN_COMPILE_AVX2)
 #  ifdef __has_attribute
 #    if __has_attribute(__vector_size__)
 #      define HAS_ATTRIBUTE_VECTORSIZE
@@ -30,7 +30,12 @@
 #  endif
 
 #  ifdef HAS_ATTRIBUTE_VECTORSIZE
-    typedef unsigned char v16qu __attribute__((__vector_size__(16)));
+#    ifdef CAN_COMPILE_SSE2
+       typedef unsigned char v16qu __attribute__((__vector_size__(16)));
+#    endif
+#    ifdef CAN_COMPILE_AVX2
+       typedef unsigned char v32qu __attribute__((__vector_size__(32)));
+#    endif
 #  endif
 #endif
 
@@ -52,6 +57,80 @@
         }\
     }
 
+#ifdef CAN_COMPILE_AVX2
+
+__attribute__ ((__target__ ("avx2")))
+static inline const uint8_t * startcode_FindAnnexB_AVX2( const uint8_t *p, const uint8_t *end )
+{
+    /* First align to 32 */
+    const uint8_t *alignedend = p + 32 - ((intptr_t)p & 31);
+    for (end -= 3; p < alignedend && p <= end; p++) {
+        if (p[0] == 0 && p[1] == 0 && p[2] == 1)
+            return p;
+    }
+
+    if( p == end )
+        return NULL;
+
+    alignedend = end - ((intptr_t) end & 31);
+    if( alignedend > p )
+    {
+#  ifdef HAS_ATTRIBUTE_VECTORSIZE
+        const v32qu zeros = { 0 };
+#  endif
+
+        for( ; p < alignedend; p += 32)
+        {
+            uint32_t match;
+#  ifdef HAS_ATTRIBUTE_VECTORSIZE
+            __asm__ volatile(
+                "vmovdqa    0(%[v]),   %%ymm0\n"
+                "vpcmpeqb   %[czero],  %%ymm0, %%ymm0\n"
+                "vpmovmskb  %%ymm0,    %[match]\n" /* mask will be in reversed match order */
+                : [match]"=r"(match)
+                : [v]"r"(p), [czero]"x"(zeros)
+                : "ymm0"
+            );
+#  else
+            __asm__ volatile(
+                "vmovdqa    0(%[v]),  %%ymm0\n"
+                "vpxor      %%ymm1,   %%ymm1, %%ymm1\n"
+                "vpcmpeqb   %%ymm1,   %%ymm0, %%ymm0\n"
+                "vpmovmskb  %%ymm0,   %[match]\n" /* mask will be in reversed match order */
+                : [match]"=r"(match)
+                : [v]"r"(p)
+                : "ymm0", "ymm1"
+            );
+#  endif
+            if( match & 0x0000000F )
+                TRY_MATCH(p, 0);
+            if( match & 0x000000F0 )
+                TRY_MATCH(p, 4);
+            if( match & 0x00000F00 )
+                TRY_MATCH(p, 8);
+            if( match & 0x0000F000 )
+                TRY_MATCH(p, 12);
+            if( match & 0x000F0000 )
+                TRY_MATCH(p, 16);
+            if( match & 0x00F00000 )
+                TRY_MATCH(p, 20);
+            if( match & 0x0F000000 )
+                TRY_MATCH(p, 24);
+            if( match & 0xF0000000 )
+                TRY_MATCH(p, 28);
+        }
+    }
+
+    for (; p <= end; p++) {
+        if (p[0] == 0 && p[1] == 0 && p[2] == 1)
+            return p;
+    }
+
+    return NULL;
+}
+
+#endif
+
 #ifdef CAN_COMPILE_SSE2
 
 __attribute__ ((__target__ ("sse2")))
@@ -152,6 +231,10 @@ static inline const uint8_t * startcode_FindAnnexB_Bits( const uint8_t *p, const
 
 static inline const uint8_t * startcode_FindAnnexB( const uint8_t *p, const uint8_t *end )
 {
+#ifdef CAN_COMPILE_AVX2
+    if (vlc_CPU_AVX2())
+        return startcode_FindAnnexB_AVX2(p, end);
+#endif
 #ifdef CAN_COMPILE_SSE2
     if (vlc_CPU_SSE2())
         return startcode_FindAnnexB_SSE2(p, end);
diff --git a/test/modules/packetizer/helpers.c b/test/modules/packetizer/helpers.c
index 475644d2ad..957298d630 100644
--- a/test/modules/packetizer/helpers.c
+++ b/test/modules/packetizer/helpers.c
@@ -93,6 +93,21 @@ static int run_annexb_sets( const uint8_t *p_set, const uint8_t *p_end,
     printf("sse2 simd asm not built in, skipping test:\n");
 #endif
 
+#ifdef CAN_COMPILE_AVX2
+    if (vlc_CPU_AVX2())
+    {
+        printf("checking avx2 simd asm:\n");
+        i_ret = check_set( p_set, p_end, p_results, i_results, i_results_offset,
+                           startcode_FindAnnexB_AVX2 );
+        if( i_ret != 0 )
+            return i_ret;
+    }
+    else
+        printf("CPU does not support avx2 simd, skipping test:\n");
+#else
+    printf("avx2 simd asm not built in, skipping test:\n");
+#endif
+
     return 0;
 }
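
For reference (not part of the patch), the 32-byte zero scan performed by the
inline assembly above can also be written with AVX2 intrinsics; the sketch
below uses a hypothetical helper name, and the asm form presumably just
mirrors the existing SSE2 helper. Bit i of the returned mask is set when
p[i] == 0, which is what the match & 0x0000000F .. match & 0xF0000000 nibble
tests consume before handing a 4-byte window to TRY_MATCH.

#include <stdint.h>
#include <immintrin.h>

/* Hypothetical helper, for illustration only. p must be 32-byte aligned,
 * as guaranteed by the scalar prologue of startcode_FindAnnexB_AVX2(). */
__attribute__ ((__target__ ("avx2")))
static inline uint32_t zero_byte_mask_avx2( const uint8_t *p )
{
    __m256i block = _mm256_load_si256( (const __m256i *)p );            /* vmovdqa   */
    __m256i eq0   = _mm256_cmpeq_epi8( block, _mm256_setzero_si256() ); /* vpcmpeqb  */
    return (uint32_t)_mm256_movemask_epi8( eq0 );                       /* vpmovmskb */
}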
 


