[vlc-devel] [PATCH 1/1] Subtitles: Encoding detection using uchardet
Salah-Eddin Shaban
salah at videolan.org
Wed Sep 5 09:13:33 CEST 2018
fixes #15173, #17257
---
configure.ac | 18 +++++++++
modules/demux/Makefile.am | 5 +++
modules/demux/subtitle.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 118 insertions(+)
diff --git a/configure.ac b/configure.ac
index 1627c12b79..1de487ce63 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2920,6 +2920,24 @@ AS_IF( [test "${enable_telx}" != "no" ],[
])
dnl
+dnl uchardet detection of subtitle encoding
+dnl
+dnl Adds an --enable-uchardet switch. Unless the user passed --disable-uchardet,
+dnl the uchardet module is looked up through PKG_CHECK_MODULES. A failed lookup
+dnl only emits a warning, so the build continues without encoding detection.
+dnl have_uchardet records the outcome and drives the HAVE_UCHARDET automake
+dnl conditional defined at the end of this stanza.
+dnl
+dnl NOTE review: the VLC_ADD_CFLAGS and VLC_ADD_LIBS calls register the flags
+dnl under the name uchardet, while modules/demux/Makefile.am appears to consume
+dnl UCHARDET_CFLAGS and UCHARDET_LIBS directly - confirm both are still needed.
+dnl
+have_uchardet="no"
+AC_ARG_ENABLE(uchardet,
+ [ --enable-uchardet Subtitle encoding detection using uchardet (default auto)])
+AS_IF( [test "${enable_uchardet}" != "no"], [
+ PKG_CHECK_MODULES([UCHARDET], [uchardet],
+ [
+ have_uchardet="yes"
+ VLC_ADD_CFLAGS([uchardet], [$UCHARDET_CFLAGS])
+ VLC_ADD_LIBS([uchardet], [$UCHARDET_LIBS])
+ ],[
+ AC_MSG_WARN([${UCHARDET_PKG_ERRORS}. Subtitle encoding detection will be disabled])
+ ])
+ ])
+AM_CONDITIONAL([HAVE_UCHARDET], [test "${have_uchardet}" = "yes"])
+
+dnl
dnl ARIB subtitles rendering module
dnl
AC_ARG_ENABLE(aribsub,
diff --git a/modules/demux/Makefile.am b/modules/demux/Makefile.am
index be6ed770fa..eb8f2b8899 100644
--- a/modules/demux/Makefile.am
+++ b/modules/demux/Makefile.am
@@ -61,7 +61,12 @@ libmjpeg_plugin_la_SOURCES = demux/mjpeg.c demux/mxpeg_helper.h
demux_LTLIBRARIES += libmjpeg_plugin.la
libsubtitle_plugin_la_SOURCES = demux/subtitle.c
+# Per-target CFLAGS are initialised from AM_CFLAGS here; the conditional
+# block below appends the uchardet-specific flags with +=.
+libsubtitle_plugin_la_CFLAGS = $(AM_CFLAGS)
libsubtitle_plugin_la_LIBADD = $(LIBM)
+# Only when configure set the HAVE_UCHARDET conditional: define HAVE_UCHARDET
+# for demux/subtitle.c and add the uchardet compile and link flags.
+if HAVE_UCHARDET
+libsubtitle_plugin_la_CFLAGS += -DHAVE_UCHARDET $(UCHARDET_CFLAGS)
+libsubtitle_plugin_la_LIBADD += $(UCHARDET_LIBS)
+endif
demux_LTLIBRARIES += libsubtitle_plugin.la
libty_plugin_la_SOURCES = demux/ty.c codec/cc.h
diff --git a/modules/demux/subtitle.c b/modules/demux/subtitle.c
index 9af186edf9..181803104b 100644
--- a/modules/demux/subtitle.c
+++ b/modules/demux/subtitle.c
@@ -42,6 +42,10 @@
#include <vlc_demux.h>
#include <vlc_charset.h>
+#ifdef HAVE_UCHARDET
+# include <uchardet.h>
+#endif
+
/*****************************************************************************
* Module descriptor
*****************************************************************************/
@@ -57,6 +61,8 @@ static void Close( vlc_object_t *p_this );
N_("Force the subtiles format. Selecting \"auto\" means autodetection and should always work.")
#define SUB_DESCRIPTION_LONGTEXT \
N_("Override the default track description.")
+#define SUB_AUTODETECT_LONGTEXT \
+ N_("Detect text encoding of subtitle files.")
static const char *const ppsz_sub_type[] =
{
@@ -83,6 +89,10 @@ vlc_module_begin ()
change_string_list( ppsz_sub_type, ppsz_sub_type )
add_string( "sub-description", NULL, N_("Subtitle description"),
SUB_DESCRIPTION_LONGTEXT, true )
+#ifdef HAVE_UCHARDET
+ add_bool( "sub-detect-encoding", true, N_("Subtitle encoding detection"),
+ SUB_AUTODETECT_LONGTEXT, true )
+#endif
set_callbacks( Open, Close )
add_shortcut( "subtitle" )
@@ -127,6 +137,30 @@ typedef struct
static int TextLoad( text_t *, stream_t *s );
static void TextUnload( text_t * );
+#ifdef HAVE_UCHARDET
+/**
+ * Builds a converted copy of a loaded subtitle text.
+ *
+ * Each line of p_src is passed through FromCharset() with the given source
+ * encoding and the results are stored in a newly allocated line array in
+ * p_dst. The read position (i_line) is copied from p_src. p_src itself is
+ * never modified.
+ *
+ * \param p_src source text, left untouched
+ * \param p_dst destination; its fields are overwritten without releasing
+ *              whatever they held before
+ * \param psz_encoding charset name of the source lines, handed to FromCharset()
+ * \return VLC_SUCCESS on success (including an empty source),
+ *         VLC_ENOMEM if the line array cannot be allocated,
+ *         VLC_EGENERIC if a line cannot be converted
+ */
+static int TextConvert( text_t const *p_src, text_t *p_dst, char const *psz_encoding )
+{
+    /* An empty source needs no allocation. calloc( 0, ... ) is allowed to
+     * return NULL, which must not be mistaken for an out-of-memory error. */
+    if( p_src->i_line_count == 0 )
+    {
+        p_dst->line = NULL;
+        p_dst->i_line_count = 0;
+        p_dst->i_line = p_src->i_line;
+        return VLC_SUCCESS;
+    }
+
+    p_dst->line = calloc( p_src->i_line_count, sizeof( *p_dst->line ) );
+    if( !p_dst->line )
+        return VLC_ENOMEM;
+    p_dst->i_line_count = p_src->i_line_count;
+    p_dst->i_line = p_src->i_line;
+
+    for( size_t i = 0; i < p_src->i_line_count; ++i )
+    {
+        char const *psz_src = p_src->line[ i ];
+        char *psz_line = FromCharset( psz_encoding, psz_src, strlen( psz_src ) );
+        if( !psz_line )
+        {
+            /* Hand the partially filled destination to TextUnload() for
+             * cleanup; entries not converted yet are still NULL thanks to
+             * calloc(). */
+            TextUnload( p_dst );
+            return VLC_EGENERIC;
+        }
+        p_dst->line[ i ] = psz_line;
+    }
+
+    return VLC_SUCCESS;
+}
+#endif
+
typedef struct
{
int64_t i_start;
@@ -648,6 +682,62 @@ static int Open ( vlc_object_t *p_this )
text_t txtlines;
TextLoad( &txtlines, p_demux->s );
+#ifdef HAVE_UCHARDET
+ bool b_utf8 = false;
+
+ if( var_InheritBool( p_demux, "sub-detect-encoding" ) )
+ {
+ if( e_bom == UTF8BOM )
+ b_utf8 = true;
+ else if( txtlines.i_line_count > 0 )
+ {
+ uchardet_t p_handle = uchardet_new();
+
+ if( p_handle )
+ {
+ size_t i = 0;
+ int i_retval = 0;
+
+ do
+ {
+ const char *line = txtlines.line[ i ];
+ size_t i_len = strlen( line );
+ if( i_len > 0 )
+ i_retval = uchardet_handle_data( p_handle, line, i_len );
+ } while( i_retval == 0 && ++i < txtlines.i_line_count );
+
+ if( i_retval == 0 )
+ {
+ uchardet_data_end( p_handle );
+ const char *psz_encoding = uchardet_get_charset( p_handle );
+ if( psz_encoding && *psz_encoding )
+ {
+ msg_Dbg( p_demux, "%s subtitle encoding detected", psz_encoding );
+
+ text_t dst;
+ if( TextConvert( &txtlines, &dst, psz_encoding ) == VLC_SUCCESS )
+ {
+ TextUnload( &txtlines );
+ txtlines = dst;
+ b_utf8 = true;
+ }
+ else
+ msg_Dbg( p_demux, "failed to convert subtitles to UTF-8" );
+ }
+ else
+ msg_Dbg( p_demux, "failed to detect subtitle encoding" );
+ }
+ else
+ msg_Err( p_demux, "error in uchardet_handle_data()" );
+
+ uchardet_delete( p_handle );
+ }
+ else
+ msg_Err( p_demux, "error in uchardet_new()" );
+ }
+ }
+#endif
+
/* Parse it */
for( size_t i_max = 0; i_max < SIZE_MAX - 500 * sizeof(subtitle_t); )
{
@@ -708,6 +798,11 @@ static int Open ( vlc_object_t *p_this )
p_demux->psz_location );
}
+#ifdef HAVE_UCHARDET
+ if( b_utf8 )
+ fmt.subs.psz_encoding = strdup( "UTF-8" );
+#endif
+
char *psz_description = var_InheritString( p_demux, "sub-description" );
if( psz_description && *psz_description )
fmt.psz_description = psz_description;
--
2.13.7
More information about the vlc-devel
mailing list