[vlc-devel] [PATCH 12/25] packetizer: rewrite startcode_FindAnnexB SSE2 asm

Victorien Le Couviour--Tuffet victorien.lecouviour.tuffet at gmail.com
Tue Apr 14 12:40:23 CEST 2020


This patch rewrites the SSE2 implementation in assembly, yielding a 62% speedup.
This measurement comes from the bench_asm tool, averaged over 42 runs.

before: 2168.43 cycles
 after: 1349.07 cycles
---
 modules/codec/Makefile.am             |  13 ++-
 modules/demux/Makefile.am             |   6 +-
 modules/mux/Makefile.am               |   3 +-
 modules/packetizer/Makefile.am        |  11 ++-
 modules/packetizer/startcode.asm      |  95 ++++++++++++++++++++++
 modules/packetizer/startcode_helper.h | 112 +++++---------------------
 modules/stream_out/Makefile.am        |   3 +-
 test/Makefile.am                      |   9 ++-
 test/modules/packetizer/helpers.c     |   8 +-
 9 files changed, 152 insertions(+), 108 deletions(-)
 create mode 100644 modules/packetizer/startcode.asm

diff --git a/modules/codec/Makefile.am b/modules/codec/Makefile.am
index fad41d0885..89d097aaf7 100644
--- a/modules/codec/Makefile.am
+++ b/modules/codec/Makefile.am
@@ -344,7 +344,8 @@ libvideotoolbox_plugin_la_SOURCES = \
 	packetizer/hxxx_sei.h packetizer/hxxx_sei.c \
 	packetizer/h264_slice.c packetizer/h264_slice.h \
 	packetizer/h264_nal.c packetizer/h264_nal.h \
-	packetizer/hevc_nal.c packetizer/hevc_nal.h
+	packetizer/hevc_nal.c packetizer/hevc_nal.h \
+	packetizer/startcode.asm
 libvideotoolbox_plugin_la_CFLAGS = $(AM_CFLAGS)
 if HAVE_IOS
 libvideotoolbox_plugin_la_CFLAGS += -miphoneos-version-min=9.0
@@ -418,6 +419,7 @@ libdxva2_plugin_la_SOURCES = \
 	codec/avcodec/va_surface.c codec/avcodec/va_surface.h \
 	packetizer/h264_nal.c packetizer/h264_nal.h \
 	packetizer/hevc_nal.c packetizer/hevc_nal.h \
+	packetizer/startcode.asm \
 	codec/avcodec/dxva_blacklist.c
 libdxva2_plugin_la_LIBADD = libd3d9_common.la $(LIBCOM) -lshlwapi -luuid
 if HAVE_AVCODEC_DXVA2
@@ -436,6 +438,7 @@ libd3d11va_plugin_la_SOURCES = \
         codec/avcodec/va_surface.c codec/avcodec/va_surface.h \
 	packetizer/h264_nal.c packetizer/h264_nal.h \
 	packetizer/hevc_nal.c packetizer/hevc_nal.h \
+	packetizer/startcode.asm \
 	codec/avcodec/dxva_blacklist.c
 libd3d11va_plugin_la_LIBADD = libd3d11_common.la $(LIBCOM) -luuid
 if HAVE_WINSTORE
@@ -572,7 +575,9 @@ codec_LTLIBRARIES += $(LTLIBdav1d)
 
 ### Hardware encoders ###
 
-libcrystalhd_plugin_la_SOURCES = codec/crystalhd.c packetizer/h264_nal.c packetizer/h264_nal.h
+libcrystalhd_plugin_la_SOURCES = codec/crystalhd.c \
+				 packetizer/h264_nal.c packetizer/h264_nal.h \
+				 packetizer/startcode.asm
 libcrystalhd_plugin_la_LDFLAGS = $(AM_LDFLAGS) -rpath '$(codecdir)'
 libcrystalhd_plugin_la_LIBADD = $(LIBS_crystalhd)
 EXTRA_LTLIBRARIES += libcrystalhd_plugin.la
@@ -596,7 +601,9 @@ libdmo_plugin_la_LIBADD += $(LIBCOM) -luuid
 codec_LTLIBRARIES += libdmo_plugin.la
 endif
 
-libmft_plugin_la_SOURCES = codec/mft.c packetizer/h264_nal.c packetizer/h264_nal.h
+libmft_plugin_la_SOURCES = codec/mft.c \
+			   packetizer/h264_nal.c packetizer/h264_nal.h \
+			   packetizer/startcode.asm
 if HAVE_WIN32
 libmft_plugin_la_LIBADD = $(LIBCOM) -luuid -lmfuuid -lmfplat
 codec_LTLIBRARIES += libmft_plugin.la
diff --git a/modules/demux/Makefile.am b/modules/demux/Makefile.am
index 954d2982e0..7d3392b41b 100644
--- a/modules/demux/Makefile.am
+++ b/modules/demux/Makefile.am
@@ -177,7 +177,8 @@ libes_plugin_la_SOURCES  = demux/mpeg/es.c \
 demux_LTLIBRARIES += libes_plugin.la
 
 libh26x_plugin_la_SOURCES = demux/mpeg/h26x.c \
-                            packetizer/h264_nal.c packetizer/hevc_nal.h
+                            packetizer/h264_nal.c packetizer/hevc_nal.h \
+			    packetizer/startcode.asm
 libh26x_plugin_la_LIBADD = $(LIBM)
 demux_LTLIBRARIES += libh26x_plugin.la
 
@@ -485,7 +486,8 @@ libadaptive_smooth_SOURCES = \
     demux/smooth/SmoothStream.hpp \
     demux/smooth/SmoothStream.cpp
 libadaptive_smooth_SOURCES += mux/mp4/libmp4mux.c mux/mp4/libmp4mux.h \
-			      packetizer/h264_nal.c packetizer/hevc_nal.c
+			      packetizer/h264_nal.c packetizer/hevc_nal.c \
+			      packetizer/startcode.asm
 
 libadaptive_plugin_la_SOURCES += $(libadaptive_hls_SOURCES)
 libadaptive_plugin_la_SOURCES += $(libadaptive_dash_SOURCES)
diff --git a/modules/mux/Makefile.am b/modules/mux/Makefile.am
index 581e8a1dd9..7d6eb53dc6 100644
--- a/modules/mux/Makefile.am
+++ b/modules/mux/Makefile.am
@@ -12,7 +12,8 @@ libmux_mp4_plugin_la_SOURCES = mux/mp4/mp4.c \
         demux/mp4/libmp4.h mux/av1_pack.h \
 	packetizer/hxxx_nal.c packetizer/hxxx_nal.h \
         packetizer/hevc_nal.c packetizer/hevc_nal.h \
-        packetizer/h264_nal.c packetizer/h264_nal.h
+        packetizer/h264_nal.c packetizer/h264_nal.h \
+	packetizer/startcode.asm
 libmux_mp4_plugin_la_SOURCES += $(extradata_builder_SOURCES)
 
 libmux_mpjpeg_plugin_la_SOURCES = mux/mpjpeg.c
diff --git a/modules/packetizer/Makefile.am b/modules/packetizer/Makefile.am
index 756cf75fab..2d7d9927a6 100644
--- a/modules/packetizer/Makefile.am
+++ b/modules/packetizer/Makefile.am
@@ -7,9 +7,11 @@ libpacketizer_av1_plugin_la_SOURCES = packetizer/av1.c \
 libpacketizer_copy_plugin_la_SOURCES = packetizer/copy.c
 libpacketizer_mpegvideo_plugin_la_SOURCES = packetizer/mpegvideo.c \
                                             packetizer/mpegvideo.h \
-                                            packetizer/iso_color_tables.h
+                                            packetizer/iso_color_tables.h \
+                                            packetizer/startcode.asm
 libpacketizer_mpeg4video_plugin_la_SOURCES = packetizer/mpeg4video.c \
-                                             packetizer/iso_color_tables.h
+                                             packetizer/iso_color_tables.h \
+                                             packetizer/startcode.asm
 libpacketizer_mjpeg_plugin_la_SOURCES = packetizer/mjpeg.c
 libpacketizer_mpeg4audio_plugin_la_SOURCES = packetizer/mpeg4audio.c \
                                              packetizer/mpeg4audio.h
@@ -21,11 +23,13 @@ libpacketizer_h264_plugin_la_SOURCES = \
 	packetizer/hxxx_sei.c packetizer/hxxx_sei.h \
 	packetizer/hxxx_common.c packetizer/hxxx_common.h \
         packetizer/hxxx_ep3b.h \
+        packetizer/startcode.asm \
         packetizer/iso_color_tables.h
 libpacketizer_vc1_plugin_la_SOURCES = packetizer/vc1.c \
         packetizer/hxxx_ep3b.h \
         packetizer/hxxx_nal.h \
-        packetizer/iso_color_tables.h
+        packetizer/iso_color_tables.h \
+        packetizer/startcode.asm
 libpacketizer_mlp_plugin_la_SOURCES = packetizer/mlp.c
 libpacketizer_flac_plugin_la_SOURCES = packetizer/flac.c \
         packetizer/flac.h
@@ -35,6 +39,7 @@ libpacketizer_hevc_plugin_la_SOURCES = packetizer/hevc.c \
 	packetizer/hxxx_nal.h \
 	packetizer/hxxx_common.c packetizer/hxxx_common.h \
         packetizer/hxxx_ep3b.h \
+        packetizer/startcode.asm \
         packetizer/iso_color_tables.h
 libpacketizer_a52_plugin_la_SOURCES = packetizer/a52.c packetizer/a52.h
 libpacketizer_dts_plugin_la_SOURCES = packetizer/dts.c \
diff --git a/modules/packetizer/startcode.asm b/modules/packetizer/startcode.asm
new file mode 100644
index 0000000000..55f294fcb5
--- /dev/null
+++ b/modules/packetizer/startcode.asm
@@ -0,0 +1,107 @@
+;*****************************************************************************
+;* startcode.asm: SIMD optimized startcode helpers
+;*****************************************************************************
+;* Copyright (C) 2019 VideoLAN Authors
+;*
+;* This program is free software; you can redistribute it and/or modify it
+;* under the terms of the GNU Lesser General Public License as published by
+;* the Free Software Foundation; either version 2.1 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public License
+;* along with this program; if not, write to the Free Software Foundation,
+;* Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+;*****************************************************************************
+
+%include "x86inc.asm"
+
+SECTION_RODATA 16
+
+pd_0x01000000: times 4 dd 0x01000000 ; bytes 00 00 00 01 in each dword
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; uint8_t const *startcode_FindAnnexB(uint8_t const *ptr, uint8_t const *end)
+;
+; Returns the address of the first 00 00 01 byte sequence lying entirely
+; within [ptr, end), or NULL if there is none.
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal startcode_FindAnnexB, 2, 7, 6, ptr, end, size
+    mov          sizeq, endq
+    sub          sizeq, ptrq
+    cmp          sizeq, 16      ; compare at full width, do not truncate the length
+    jl .end                     ; fewer than 16 bytes: scalar scan only
+
+    LEA            r6q, pd_0x01000000
+    mova            m0, [r6q]
+.loop:
+    movu            m1, [ptrq]
+    psrldq          m2, m1, 1
+    psrldq          m3, m1, 2
+    punpckhbw       m4, m1, m2  ; 899A ABBC ____ ____
+    punpcklbw       m1, m2      ; 0112 2334 4556 6778
+    punpckhbw       m5, m2, m3  ; 9AAB BCCD ____ ____
+    punpcklbw       m2, m3      ; 1223 3445 5667 7889
+    punpckldq       m3, m4, m5  ; 899A 9AAB ABBC BCCD
+    pcmpeqd         m1, m0      ; match at offsets 0, 2, 4, 6
+    pcmpeqd         m2, m0      ; match at offsets 1, 3, 5, 7
+    pcmpeqd         m3, m0      ; match at offsets 8, 9, 10, 11
+    movmskps       r3d, m1
+    movmskps       r4d, m2
+    movmskps       r5d, m3
+    ; tzcnt executes as bsf on CPUs without BMI1, and bsf leaves its
+    ; destination undefined for a zero source. Set a sentinel bit above the
+    ; four mask bits so an empty mask always yields 24 on any SSE2 CPU.
+    or             r3d, 1 << 24
+    or             r4d, 1 << 24
+    or             r5d, 1 << 24
+    tzcnt          r3d, r3d
+    tzcnt          r4d, r4d
+    tzcnt          r5d, r5d
+    shl            r3d, 1           ; offset = 2*index      (no match: 48)
+    lea            r4d, [r4d*2+1]   ; offset = 2*index + 1  (no match: 49)
+    add            r5d, 8           ; offset = index + 8    (no match: 32)
+    cmp            r4d, r3d
+    cmovl          r3d, r4d
+    cmp            r5d, r3d
+    cmovl          r3d, r5d         ; r3d = smallest candidate offset
+    cmp            r3d, 32
+    jl .found                       ; real offsets are at most 11
+    add           ptrq, 12
+    sub          sizeq, 12
+    cmp          sizeq, 16
+    jge .loop
+
+DEFINE_ARGS ptr, _, size, tmp
+.end:
+    sub          sizeq, 3       ; number of remaining 3-byte windows, minus one
+    jl .ret_null
+.end_loop:
+    xor           tmpd, tmpd
+    test   word [ptrq], 0xFFFF  ; ZF set when bytes 0 and 1 are both zero
+    cmovz         tmpw, [ptrq+1] ; then tmp = bytes 1..2, 0x0100 when byte 2 is 01
+    xor           tmpd, 0x0100
+    jz .ret
+    inc           ptrq
+    dec          sizeq
+    jge .end_loop
+.ret_null:
+    xor           ptrq, ptrq
+.ret:
+%if ARCH_X86_32
+    mov            eax, ptrd
+%else
+    mov            rax, ptrq
+%endif
+    RET
+
+.found:
+    lea            rax, [ptrq+r3q]
+    RET
diff --git a/modules/packetizer/startcode_helper.h b/modules/packetizer/startcode_helper.h
index e6045b9708..4009e06a65 100644
--- a/modules/packetizer/startcode_helper.h
+++ b/modules/packetizer/startcode_helper.h
@@ -23,94 +23,6 @@
 #include <vlc_block_helper.h>
 #include <vlc_cpu.h>
 
-#if !defined(CAN_COMPILE_SSE2) && defined(HAVE_SSE2_INTRINSICS)
-   #include <emmintrin.h>
-#endif
-
-/* Looks up efficiently for an AnnexB startcode 0x00 0x00 0x01
- * by using a 4 times faster trick than single byte lookup. */
-
-#define TRY_MATCH(p,a) {\
-     if (p[a+1] == 0) {\
-            if (p[a+0] == 0 && p[a+2] == 1)\
-                return a+p;\
-            if (p[a+2] == 0 && p[a+3] == 1)\
-                return a+p+1;\
-        }\
-        if (p[a+3] == 0) {\
-            if (p[a+2] == 0 && p[a+4] == 1)\
-                return a+p+2;\
-            if (p[a+4] == 0 && p[a+5] == 1)\
-                return a+p+3;\
-        }\
-    }
-
-#if defined(CAN_COMPILE_SSE2) || defined(HAVE_SSE2_INTRINSICS)
-
-__attribute__ ((__target__ ("sse2")))
-static inline uint8_t const *
-startcode_FindAnnexB_SSE2(uint8_t const *p, uint8_t const *end)
-{
-    /* First align to 16 */
-    /* Skipping this step and doing unaligned loads isn't faster */
-    uint8_t const *alignedend = p + 16 - ((intptr_t)p & 15);
-    for (end -= 3; p < alignedend && p <= end; p++) {
-        if (p[0] == 0 && p[1] == 0 && p[2] == 1)
-            return p;
-    }
-
-    if (p == end)
-        return NULL;
-
-    alignedend = end - ((intptr_t) end & 15);
-    if (alignedend > p)
-    {
-#ifdef CAN_COMPILE_SSE2
-        asm volatile(
-            "pxor   %%xmm1, %%xmm1\n"
-            ::: "xmm1"
-        );
-#else
-        __m128i zeros = _mm_set1_epi8(0x00);
-#endif
-        for (; p < alignedend; p += 16)
-        {
-            uint32_t match;
-#ifdef CAN_COMPILE_SSE2
-            asm volatile(
-                "movdqa   0(%[v]),   %%xmm0\n"
-                "pcmpeqb   %%xmm1,   %%xmm0\n"
-                "pmovmskb  %%xmm0,   %[match]\n"
-                : [match]"=r"(match)
-                : [v]"r"(p)
-                : "xmm0"
-            );
-#else
-            __m128i v = _mm_load_si128((__m128i*)p);
-            __m128i res = _mm_cmpeq_epi8( zeros, v );
-            match = _mm_movemask_epi8( res ); /* mask will be in reversed match order */
-#endif
-            if( match & 0x000F )
-                TRY_MATCH(p, 0);
-            if( match & 0x00F0 )
-                TRY_MATCH(p, 4);
-            if( match & 0x0F00 )
-                TRY_MATCH(p, 8);
-            if( match & 0xF000 )
-                TRY_MATCH(p, 12);
-        }
-    }
-
-    for (; p <= end; p++) {
-        if (p[0] == 0 && p[1] == 0 && p[2] == 1)
-            return p;
-    }
-
-    return NULL;
-}
-
-#endif
-
 /* That code is adapted from libav's ff_avc_find_startcode_internal
  * and i believe the trick originated from
  * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
@@ -130,7 +42,20 @@ startcode_FindAnnexB_Bits(uint8_t const *p, uint8_t const *end)
         if ((x - 0x01010101) & (~x) & 0x80808080)
         {
             /* matching DW isn't faster */
-            TRY_MATCH(p, 0);
+            /* Looks up efficiently for an AnnexB startcode 0x00 0x00 0x01
+             * by using a 4 times faster trick than single byte lookup. */
+            if (p[1] == 0) {
+                if (p[0] == 0 && p[2] == 1)
+                    return p;
+                if (p[2] == 0 && p[3] == 1)
+                    return p+1;
+            }
+            if (p[3] == 0) {
+                if (p[2] == 0 && p[4] == 1)
+                    return p+2;
+                if (p[4] == 0 && p[5] == 1)
+                    return p+3;
+            }
         }
     }
 
@@ -141,15 +66,18 @@ startcode_FindAnnexB_Bits(uint8_t const *p, uint8_t const *end)
 
     return NULL;
 }
-#undef TRY_MATCH
 
+#if defined(__i386__) || defined(__x86_64__)
+uint8_t const *vlcpriv_startcode_FindAnnexB_sse2(uint8_t const *ptr,
+                                                 uint8_t const *end);
+#endif
 
 static inline block_startcode_helper_t
 startcode_FindAnnexB_helper(void)
 {
-#if defined(CAN_COMPILE_SSE2) || defined(HAVE_SSE2_INTRINSICS)
+#if defined(__i386__) || defined(__x86_64__)
     if (vlc_CPU_SSE2())
-        return startcode_FindAnnexB_SSE2;
+        return vlcpriv_startcode_FindAnnexB_sse2;
     else
 #endif
         return startcode_FindAnnexB_Bits;
diff --git a/modules/stream_out/Makefile.am b/modules/stream_out/Makefile.am
index e9a7feccfc..cae2478bf5 100644
--- a/modules/stream_out/Makefile.am
+++ b/modules/stream_out/Makefile.am
@@ -78,7 +78,8 @@ endif
 sout_LTLIBRARIES += libstream_out_rtp_plugin.la
 libstream_out_rtp_plugin_la_SOURCES = \
 	stream_out/rtp.c stream_out/rtp.h stream_out/rtpfmt.c \
-	stream_out/rtcp.c stream_out/rtsp.c
+	stream_out/rtcp.c stream_out/rtsp.c \
+	packetizer/startcode.asm
 libstream_out_rtp_plugin_la_CFLAGS = $(AM_CFLAGS)
 libstream_out_rtp_plugin_la_LIBADD = $(SOCKET_LIBS)
 if HAVE_GCRYPT
diff --git a/test/Makefile.am b/test/Makefile.am
index 57cc271f6f..eafe46f5e1 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -9,6 +9,11 @@ extra_check_verbose_ = $(extra_check_flags__$(AM_DEFAULT_VERBOSITY))
 extra_check_verbose_0 = @echo TEST $@
 extra_check_verbose__0 = $(extra_check_verbose_0)
 
+SUFFIXES = .asm
+
+.asm.o:
+	$(X86ASM) $(X86ASMFLAGS) $(X86ASMDEFS) -I$(top_srcdir)/extras/include/x86/ $< -o $@
+
 ###############################################################################
 # Unit/regression test
 ###############################################################################
@@ -142,9 +147,9 @@ test_src_interface_dialog_SOURCES = src/interface/dialog.c
 test_src_interface_dialog_LDADD = $(LIBVLCCORE) $(LIBVLC)
 test_src_media_source_LDADD = $(LIBVLCCORE) $(LIBVLC)
 test_src_media_source_SOURCES = src/media_source/media_source.c
-test_modules_packetizer_helpers_SOURCES = modules/packetizer/helpers.c
+test_modules_packetizer_helpers_SOURCES = modules/packetizer/helpers.c $(top_srcdir)/modules/packetizer/startcode.asm
 test_modules_packetizer_helpers_LDADD = $(LIBVLCCORE) $(LIBVLC)
-test_modules_packetizer_hxxx_SOURCES = modules/packetizer/hxxx.c
+test_modules_packetizer_hxxx_SOURCES = modules/packetizer/hxxx.c $(top_srcdir)/modules/packetizer/startcode.asm
 test_modules_packetizer_hxxx_LDADD = $(LIBVLCCORE) $(LIBVLC)
 test_modules_packetizer_h264_SOURCES = modules/packetizer/h264.c \
 				modules/packetizer/packetizer.h
diff --git a/test/modules/packetizer/helpers.c b/test/modules/packetizer/helpers.c
index 63fd8de4b7..4ee3418ff3 100644
--- a/test/modules/packetizer/helpers.c
+++ b/test/modules/packetizer/helpers.c
@@ -76,13 +76,13 @@ static int run_annexb_sets( const uint8_t *p_set, const uint8_t *p_end,
     if( i_ret != 0 )
         return i_ret;
 
-    /* Perform same tests on simd optimized code */
-#if defined(CAN_COMPILE_SSE2) || defined(HAVE_SSE2_INTRINSICS)
+    /* Perform same tests on SIMD optimized code */
+#if defined(__i386__) || defined(__x86_64__)
     if (vlc_CPU_SSE2())
     {
-        printf("checking asm:\n");
+        printf("checking SSE2 asm:\n");
         i_ret = check_set( p_set, p_end, p_results, i_results, i_results_offset,
-                           startcode_FindAnnexB_SSE2 );
+                           vlcpriv_startcode_FindAnnexB_sse2 );
         if( i_ret != 0 )
             return i_ret;
     }
-- 
2.24.1



More information about the vlc-devel mailing list