[vlc-devel] [PATCH 12/25] packetizer: rewrite startcode_FindAnnexB SSE2 asm
Victorien Le Couviour--Tuffet
victorien.lecouviour.tuffet at gmail.com
Tue Apr 14 12:40:23 CEST 2020
This patch rewrites the SSE2 implementation, yielding a 62% speedup.
The measurement comes from the bench_asm tool, averaged over 42 runs.
before: 2168.43 cycles
after: 1349.07 cycles
---
modules/codec/Makefile.am | 13 ++-
modules/demux/Makefile.am | 6 +-
modules/mux/Makefile.am | 3 +-
modules/packetizer/Makefile.am | 11 ++-
modules/packetizer/startcode.asm | 95 ++++++++++++++++++++++
modules/packetizer/startcode_helper.h | 112 +++++---------------------
modules/stream_out/Makefile.am | 3 +-
test/Makefile.am | 9 ++-
test/modules/packetizer/helpers.c | 8 +-
9 files changed, 152 insertions(+), 108 deletions(-)
create mode 100644 modules/packetizer/startcode.asm
diff --git a/modules/codec/Makefile.am b/modules/codec/Makefile.am
index fad41d0885..89d097aaf7 100644
--- a/modules/codec/Makefile.am
+++ b/modules/codec/Makefile.am
@@ -344,7 +344,8 @@ libvideotoolbox_plugin_la_SOURCES = \
packetizer/hxxx_sei.h packetizer/hxxx_sei.c \
packetizer/h264_slice.c packetizer/h264_slice.h \
packetizer/h264_nal.c packetizer/h264_nal.h \
- packetizer/hevc_nal.c packetizer/hevc_nal.h
+ packetizer/hevc_nal.c packetizer/hevc_nal.h \
+ packetizer/startcode.asm
libvideotoolbox_plugin_la_CFLAGS = $(AM_CFLAGS)
if HAVE_IOS
libvideotoolbox_plugin_la_CFLAGS += -miphoneos-version-min=9.0
@@ -418,6 +419,7 @@ libdxva2_plugin_la_SOURCES = \
codec/avcodec/va_surface.c codec/avcodec/va_surface.h \
packetizer/h264_nal.c packetizer/h264_nal.h \
packetizer/hevc_nal.c packetizer/hevc_nal.h \
+ packetizer/startcode.asm \
codec/avcodec/dxva_blacklist.c
libdxva2_plugin_la_LIBADD = libd3d9_common.la $(LIBCOM) -lshlwapi -luuid
if HAVE_AVCODEC_DXVA2
@@ -436,6 +438,7 @@ libd3d11va_plugin_la_SOURCES = \
codec/avcodec/va_surface.c codec/avcodec/va_surface.h \
packetizer/h264_nal.c packetizer/h264_nal.h \
packetizer/hevc_nal.c packetizer/hevc_nal.h \
+ packetizer/startcode.asm \
codec/avcodec/dxva_blacklist.c
libd3d11va_plugin_la_LIBADD = libd3d11_common.la $(LIBCOM) -luuid
if HAVE_WINSTORE
@@ -572,7 +575,9 @@ codec_LTLIBRARIES += $(LTLIBdav1d)
### Hardware encoders ###
-libcrystalhd_plugin_la_SOURCES = codec/crystalhd.c packetizer/h264_nal.c packetizer/h264_nal.h
+libcrystalhd_plugin_la_SOURCES = codec/crystalhd.c \
+ packetizer/h264_nal.c packetizer/h264_nal.h \
+ packetizer/startcode.asm
libcrystalhd_plugin_la_LDFLAGS = $(AM_LDFLAGS) -rpath '$(codecdir)'
libcrystalhd_plugin_la_LIBADD = $(LIBS_crystalhd)
EXTRA_LTLIBRARIES += libcrystalhd_plugin.la
@@ -596,7 +601,9 @@ libdmo_plugin_la_LIBADD += $(LIBCOM) -luuid
codec_LTLIBRARIES += libdmo_plugin.la
endif
-libmft_plugin_la_SOURCES = codec/mft.c packetizer/h264_nal.c packetizer/h264_nal.h
+libmft_plugin_la_SOURCES = codec/mft.c \
+ packetizer/h264_nal.c packetizer/h264_nal.h \
+ packetizer/startcode.asm
if HAVE_WIN32
libmft_plugin_la_LIBADD = $(LIBCOM) -luuid -lmfuuid -lmfplat
codec_LTLIBRARIES += libmft_plugin.la
diff --git a/modules/demux/Makefile.am b/modules/demux/Makefile.am
index 954d2982e0..7d3392b41b 100644
--- a/modules/demux/Makefile.am
+++ b/modules/demux/Makefile.am
@@ -177,7 +177,8 @@ libes_plugin_la_SOURCES = demux/mpeg/es.c \
demux_LTLIBRARIES += libes_plugin.la
libh26x_plugin_la_SOURCES = demux/mpeg/h26x.c \
- packetizer/h264_nal.c packetizer/hevc_nal.h
+ packetizer/h264_nal.c packetizer/hevc_nal.h \
+ packetizer/startcode.asm
libh26x_plugin_la_LIBADD = $(LIBM)
demux_LTLIBRARIES += libh26x_plugin.la
@@ -485,7 +486,8 @@ libadaptive_smooth_SOURCES = \
demux/smooth/SmoothStream.hpp \
demux/smooth/SmoothStream.cpp
libadaptive_smooth_SOURCES += mux/mp4/libmp4mux.c mux/mp4/libmp4mux.h \
- packetizer/h264_nal.c packetizer/hevc_nal.c
+ packetizer/h264_nal.c packetizer/hevc_nal.c \
+ packetizer/startcode.asm
libadaptive_plugin_la_SOURCES += $(libadaptive_hls_SOURCES)
libadaptive_plugin_la_SOURCES += $(libadaptive_dash_SOURCES)
diff --git a/modules/mux/Makefile.am b/modules/mux/Makefile.am
index 581e8a1dd9..7d6eb53dc6 100644
--- a/modules/mux/Makefile.am
+++ b/modules/mux/Makefile.am
@@ -12,7 +12,8 @@ libmux_mp4_plugin_la_SOURCES = mux/mp4/mp4.c \
demux/mp4/libmp4.h mux/av1_pack.h \
packetizer/hxxx_nal.c packetizer/hxxx_nal.h \
packetizer/hevc_nal.c packetizer/hevc_nal.h \
- packetizer/h264_nal.c packetizer/h264_nal.h
+ packetizer/h264_nal.c packetizer/h264_nal.h \
+ packetizer/startcode.asm
libmux_mp4_plugin_la_SOURCES += $(extradata_builder_SOURCES)
libmux_mpjpeg_plugin_la_SOURCES = mux/mpjpeg.c
diff --git a/modules/packetizer/Makefile.am b/modules/packetizer/Makefile.am
index 756cf75fab..2d7d9927a6 100644
--- a/modules/packetizer/Makefile.am
+++ b/modules/packetizer/Makefile.am
@@ -7,9 +7,11 @@ libpacketizer_av1_plugin_la_SOURCES = packetizer/av1.c \
libpacketizer_copy_plugin_la_SOURCES = packetizer/copy.c
libpacketizer_mpegvideo_plugin_la_SOURCES = packetizer/mpegvideo.c \
packetizer/mpegvideo.h \
- packetizer/iso_color_tables.h
+ packetizer/iso_color_tables.h \
+ packetizer/startcode.asm
libpacketizer_mpeg4video_plugin_la_SOURCES = packetizer/mpeg4video.c \
- packetizer/iso_color_tables.h
+ packetizer/iso_color_tables.h \
+ packetizer/startcode.asm
libpacketizer_mjpeg_plugin_la_SOURCES = packetizer/mjpeg.c
libpacketizer_mpeg4audio_plugin_la_SOURCES = packetizer/mpeg4audio.c \
packetizer/mpeg4audio.h
@@ -21,11 +23,13 @@ libpacketizer_h264_plugin_la_SOURCES = \
packetizer/hxxx_sei.c packetizer/hxxx_sei.h \
packetizer/hxxx_common.c packetizer/hxxx_common.h \
packetizer/hxxx_ep3b.h \
+ packetizer/startcode.asm \
packetizer/iso_color_tables.h
libpacketizer_vc1_plugin_la_SOURCES = packetizer/vc1.c \
packetizer/hxxx_ep3b.h \
packetizer/hxxx_nal.h \
- packetizer/iso_color_tables.h
+ packetizer/iso_color_tables.h \
+ packetizer/startcode.asm
libpacketizer_mlp_plugin_la_SOURCES = packetizer/mlp.c
libpacketizer_flac_plugin_la_SOURCES = packetizer/flac.c \
packetizer/flac.h
@@ -35,6 +39,7 @@ libpacketizer_hevc_plugin_la_SOURCES = packetizer/hevc.c \
packetizer/hxxx_nal.h \
packetizer/hxxx_common.c packetizer/hxxx_common.h \
packetizer/hxxx_ep3b.h \
+ packetizer/startcode.asm \
packetizer/iso_color_tables.h
libpacketizer_a52_plugin_la_SOURCES = packetizer/a52.c packetizer/a52.h
libpacketizer_dts_plugin_la_SOURCES = packetizer/dts.c \
diff --git a/modules/packetizer/startcode.asm b/modules/packetizer/startcode.asm
new file mode 100644
index 0000000000..55f294fcb5
--- /dev/null
+++ b/modules/packetizer/startcode.asm
@@ -0,0 +1,95 @@
+;*****************************************************************************
+;* startcode.asm: SIMD optimized startcode helpers
+;*****************************************************************************
+;* Copyright (C) 2019 VideoLAN Authors
+;*
+;* This program is free software; you can redistribute it and/or modify it
+;* under the terms of the GNU Lesser General Public License as published by
+;* the Free Software Foundation; either version 2.1 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public License
+;* along with this program; if not, write to the Free Software Foundation,
+;* Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+;*****************************************************************************
+
+%include "x86inc.asm"
+
+SECTION_RODATA 16
+
+pd_0x01000000: times 4 dd 0x01000000
+
+SECTION .text
+
+INIT_XMM sse2 ; SSE2 search for the first Annex B start code (bytes 00 00 01) in [ptr, end)
+cglobal startcode_FindAnnexB, 2, 7, 6, ptr, end, size ; in: ptr, end; out: address of the first match, or NULL if there is none
+ mov sizeq, endq
+ sub sizeq, ptrq ; size = end - ptr = number of bytes left to scan
+ cmp sized, 16 ; NOTE(review): only the low 32 bits of size are compared - confirm callers never pass >= 2 GiB
+ jl .end ; not enough data for a 16-byte load: go straight to the scalar loop
+ ; m0 = 4 x 0x01000000, i.e. the byte sequence 00 00 00 01 in every dword
+ LEA r6q, pd_0x01000000 ; LEA is a macro defined outside this file (presumably a PIC-safe address load - confirm)
+ mova m0, [r6q]
+.loop: ; each pass tests start offsets 0..11 of the 16 bytes at ptr, then advances by 12
+ movu m1, [ptrq] ; m1 = bytes 0..F (unaligned load)
+ psrldq m2, m1, 1 ; m2 = bytes 1..F, zero-filled at the top
+ psrldq m3, m1, 2 ; m3 = bytes 2..F, zero-filled at the top
+ punpckhbw m4, m1, m2 ; 899A ABBC ____ ____ (hex digit = source byte index; a dword i,i+1,i+1,i+2 equals m0 iff bytes i..i+2 are 00 00 01)
+ punpcklbw m1, m2 ; 0112 2334 4556 6778 -> candidates at even offsets 0,2,4,6
+ punpckhbw m5, m2, m3 ; 9AAB BCCD ____ ____
+ punpcklbw m2, m3 ; 1223 3445 5667 7889 -> candidates at odd offsets 1,3,5,7
+ punpckldq m3, m4, m5 ; 899A 9AAB ABBC BCCD -> candidates at offsets 8,9,10,11
+ pcmpeqd m1, m0 ; each dword becomes all-ones where its candidate is a start code
+ pcmpeqd m2, m0
+ pcmpeqd m3, m0
+ movmskps r3d, m1 ; collect the sign bit of each dword: 4-bit match mask per register
+ movmskps r4d, m2
+ movmskps r5d, m3
+ tzcnt r3d, r3d ; index of the first matching dword; the code below relies on 32 being returned for an empty mask
+ tzcnt r4d, r4d ; NOTE(review): tzcnt is a BMI1 instruction; without BMI1 it decodes as bsf, whose result is undefined for 0
+ tzcnt r5d, r5d ; NOTE(review): so the "no match" case may misbehave on SSE2-only CPUs - confirm or guard
+ shl r3d, 1 ; dword index k -> byte offset 2k (even candidates)
+ lea r4d, [r4d*2+1] ; dword index k -> byte offset 2k+1 (odd candidates)
+ add r5d, 8 ; dword index k -> byte offset 8+k
+ cmp r4d, r3d
+ cmovl r3d, r4d ; r3d = min(even offset, odd offset)
+ cmp r5d, r3d
+ cmovl r3d, r5d ; r3d = smallest candidate offset overall
+ cmp r3d, 32 ; a real match is at most 11; with tzcnt == 32 everywhere the minimum is 40
+ jl .found
+ add ptrq, 12 ; skip the 12 start offsets just tested
+ sub sized, 12
+ cmp sized, 16
+ jge .loop ; keep going while a full 16-byte load still fits
+ ; scalar tail: fewer than 16 bytes remain, test one start offset at a time
+DEFINE_ARGS ptr, _, size, tmp ; rename registers: the end argument is no longer needed, r3 becomes tmp
+.end:
+ sub sized, 3 ; size = (bytes left) - 3; negative means no room for a 3-byte start code
+ jl .ret_null
+.end_loop:
+ xor tmpd, tmpd ; tmp = 0, so the final test fails unless cmovz loads below
+ test word [ptrq], 0xFFFF ; ZF = 1 iff bytes 0 and 1 are both zero
+ cmovz tmpw, [ptrq+1] ; if so, tmp = bytes 1..2 read as a little-endian word
+ xor tmpd, 0x0100 ; result is zero iff byte 1 == 00 and byte 2 == 01
+ jz .ret ; start code found at ptr
+ inc ptrq
+ dec sized
+ jge .end_loop ; loop while at least 3 bytes remain at ptr
+.ret_null:
+ xor ptrq, ptrq ; no start code: return NULL
+.ret:
+%if ARCH_X86_32
+ mov eax, ptrd ; copy ptr (match address or NULL) into the return register
+%else
+ mov rax, ptrq ; copy ptr (match address or NULL) into the return register
+%endif
+ RET
+ ; SIMD hit: r3 holds the byte offset of the first start code relative to ptr
+.found:
+ lea rax, [ptrq+r3q] ; return ptr + offset
+ RET
diff --git a/modules/packetizer/startcode_helper.h b/modules/packetizer/startcode_helper.h
index e6045b9708..4009e06a65 100644
--- a/modules/packetizer/startcode_helper.h
+++ b/modules/packetizer/startcode_helper.h
@@ -23,94 +23,6 @@
#include <vlc_block_helper.h>
#include <vlc_cpu.h>
-#if !defined(CAN_COMPILE_SSE2) && defined(HAVE_SSE2_INTRINSICS)
- #include <emmintrin.h>
-#endif
-
-/* Looks up efficiently for an AnnexB startcode 0x00 0x00 0x01
- * by using a 4 times faster trick than single byte lookup. */
-
-#define TRY_MATCH(p,a) {\
- if (p[a+1] == 0) {\
- if (p[a+0] == 0 && p[a+2] == 1)\
- return a+p;\
- if (p[a+2] == 0 && p[a+3] == 1)\
- return a+p+1;\
- }\
- if (p[a+3] == 0) {\
- if (p[a+2] == 0 && p[a+4] == 1)\
- return a+p+2;\
- if (p[a+4] == 0 && p[a+5] == 1)\
- return a+p+3;\
- }\
- }
-
-#if defined(CAN_COMPILE_SSE2) || defined(HAVE_SSE2_INTRINSICS)
-
-__attribute__ ((__target__ ("sse2")))
-static inline uint8_t const *
-startcode_FindAnnexB_SSE2(uint8_t const *p, uint8_t const *end)
-{
- /* First align to 16 */
- /* Skipping this step and doing unaligned loads isn't faster */
- uint8_t const *alignedend = p + 16 - ((intptr_t)p & 15);
- for (end -= 3; p < alignedend && p <= end; p++) {
- if (p[0] == 0 && p[1] == 0 && p[2] == 1)
- return p;
- }
-
- if (p == end)
- return NULL;
-
- alignedend = end - ((intptr_t) end & 15);
- if (alignedend > p)
- {
-#ifdef CAN_COMPILE_SSE2
- asm volatile(
- "pxor %%xmm1, %%xmm1\n"
- ::: "xmm1"
- );
-#else
- __m128i zeros = _mm_set1_epi8(0x00);
-#endif
- for (; p < alignedend; p += 16)
- {
- uint32_t match;
-#ifdef CAN_COMPILE_SSE2
- asm volatile(
- "movdqa 0(%[v]), %%xmm0\n"
- "pcmpeqb %%xmm1, %%xmm0\n"
- "pmovmskb %%xmm0, %[match]\n"
- : [match]"=r"(match)
- : [v]"r"(p)
- : "xmm0"
- );
-#else
- __m128i v = _mm_load_si128((__m128i*)p);
- __m128i res = _mm_cmpeq_epi8( zeros, v );
- match = _mm_movemask_epi8( res ); /* mask will be in reversed match order */
-#endif
- if( match & 0x000F )
- TRY_MATCH(p, 0);
- if( match & 0x00F0 )
- TRY_MATCH(p, 4);
- if( match & 0x0F00 )
- TRY_MATCH(p, 8);
- if( match & 0xF000 )
- TRY_MATCH(p, 12);
- }
- }
-
- for (; p <= end; p++) {
- if (p[0] == 0 && p[1] == 0 && p[2] == 1)
- return p;
- }
-
- return NULL;
-}
-
-#endif
-
/* That code is adapted from libav's ff_avc_find_startcode_internal
* and i believe the trick originated from
* https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
@@ -130,7 +42,20 @@ startcode_FindAnnexB_Bits(uint8_t const *p, uint8_t const *end)
if ((x - 0x01010101) & (~x) & 0x80808080)
{
/* matching DW isn't faster */
- TRY_MATCH(p, 0);
+ /* Looks up efficiently for an AnnexB startcode 0x00 0x00 0x01
+ * by using a 4 times faster trick than single byte lookup. */
+ if (p[1] == 0) {
+ if (p[0] == 0 && p[2] == 1)
+ return p;
+ if (p[2] == 0 && p[3] == 1)
+ return p+1;
+ }
+ if (p[3] == 0) {
+ if (p[2] == 0 && p[4] == 1)
+ return p+2;
+ if (p[4] == 0 && p[5] == 1)
+ return p+3;
+ }
}
}
@@ -141,15 +66,18 @@ startcode_FindAnnexB_Bits(uint8_t const *p, uint8_t const *end)
return NULL;
}
-#undef TRY_MATCH
+#if defined(__i386__) || defined(__x86_64__)
+uint8_t const *vlcpriv_startcode_FindAnnexB_sse2(uint8_t const *ptr,
+ uint8_t const *end);
+#endif
static inline block_startcode_helper_t
startcode_FindAnnexB_helper(void)
{
-#if defined(CAN_COMPILE_SSE2) || defined(HAVE_SSE2_INTRINSICS)
+#if defined(__i386__) || defined(__x86_64__)
if (vlc_CPU_SSE2())
- return startcode_FindAnnexB_SSE2;
+ return vlcpriv_startcode_FindAnnexB_sse2;
else
#endif
return startcode_FindAnnexB_Bits;
diff --git a/modules/stream_out/Makefile.am b/modules/stream_out/Makefile.am
index e9a7feccfc..cae2478bf5 100644
--- a/modules/stream_out/Makefile.am
+++ b/modules/stream_out/Makefile.am
@@ -78,7 +78,8 @@ endif
sout_LTLIBRARIES += libstream_out_rtp_plugin.la
libstream_out_rtp_plugin_la_SOURCES = \
stream_out/rtp.c stream_out/rtp.h stream_out/rtpfmt.c \
- stream_out/rtcp.c stream_out/rtsp.c
+ stream_out/rtcp.c stream_out/rtsp.c \
+ packetizer/startcode.asm
libstream_out_rtp_plugin_la_CFLAGS = $(AM_CFLAGS)
libstream_out_rtp_plugin_la_LIBADD = $(SOCKET_LIBS)
if HAVE_GCRYPT
diff --git a/test/Makefile.am b/test/Makefile.am
index 57cc271f6f..eafe46f5e1 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -9,6 +9,11 @@ extra_check_verbose_ = $(extra_check_flags__$(AM_DEFAULT_VERBOSITY))
extra_check_verbose_0 = @echo TEST $@
extra_check_verbose__0 = $(extra_check_verbose_0)
+SUFFIXES = .asm
+
+.asm.o:
+ $(X86ASM) $(X86ASMFLAGS) $(X86ASMDEFS) -I$(top_srcdir)/extras/include/x86/ $< -o $@
+
###############################################################################
# Unit/regression test
###############################################################################
@@ -142,9 +147,9 @@ test_src_interface_dialog_SOURCES = src/interface/dialog.c
test_src_interface_dialog_LDADD = $(LIBVLCCORE) $(LIBVLC)
test_src_media_source_LDADD = $(LIBVLCCORE) $(LIBVLC)
test_src_media_source_SOURCES = src/media_source/media_source.c
-test_modules_packetizer_helpers_SOURCES = modules/packetizer/helpers.c
+test_modules_packetizer_helpers_SOURCES = modules/packetizer/helpers.c $(top_srcdir)/modules/packetizer/startcode.asm
test_modules_packetizer_helpers_LDADD = $(LIBVLCCORE) $(LIBVLC)
-test_modules_packetizer_hxxx_SOURCES = modules/packetizer/hxxx.c
+test_modules_packetizer_hxxx_SOURCES = modules/packetizer/hxxx.c $(top_srcdir)/modules/packetizer/startcode.asm
test_modules_packetizer_hxxx_LDADD = $(LIBVLCCORE) $(LIBVLC)
test_modules_packetizer_h264_SOURCES = modules/packetizer/h264.c \
modules/packetizer/packetizer.h
diff --git a/test/modules/packetizer/helpers.c b/test/modules/packetizer/helpers.c
index 63fd8de4b7..4ee3418ff3 100644
--- a/test/modules/packetizer/helpers.c
+++ b/test/modules/packetizer/helpers.c
@@ -76,13 +76,13 @@ static int run_annexb_sets( const uint8_t *p_set, const uint8_t *p_end,
if( i_ret != 0 )
return i_ret;
- /* Perform same tests on simd optimized code */
-#if defined(CAN_COMPILE_SSE2) || defined(HAVE_SSE2_INTRINSICS)
+ /* Perform same tests on SIMD optimized code */
+#if defined(__i386__) || defined(__x86_64__)
if (vlc_CPU_SSE2())
{
- printf("checking asm:\n");
+ printf("checking SSE2 asm:\n");
i_ret = check_set( p_set, p_end, p_results, i_results, i_results_offset,
- startcode_FindAnnexB_SSE2 );
+ vlcpriv_startcode_FindAnnexB_sse2 );
if( i_ret != 0 )
return i_ret;
}
--
2.24.1
More information about the vlc-devel
mailing list