[vlc-devel] [PATCH 1/1] Subtitles: Encoding detection using uchardet

Salah-Eddin Shaban salah at videolan.org
Wed Sep 5 09:13:33 CEST 2018


fixes #15173, #17257
---
 configure.ac              | 18 +++++++++
 modules/demux/Makefile.am |  5 +++
 modules/demux/subtitle.c  | 95 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 118 insertions(+)

diff --git a/configure.ac b/configure.ac
index 1627c12b79..1de487ce63 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2920,6 +2920,24 @@ AS_IF( [test "${enable_telx}" != "no" ],[
   ])
 
 dnl
+dnl uchardet detection of subtitle encoding
+dnl
+have_uchardet="no"
+AC_ARG_ENABLE(uchardet,
+  [  --enable-uchardet       Subtitle encoding detection using uchardet (default auto)])
+AS_IF( [test "${enable_uchardet}" != "no"], [
+  PKG_CHECK_MODULES([UCHARDET], [uchardet],
+      [
+        have_uchardet="yes"
+        VLC_ADD_CFLAGS([uchardet], [$UCHARDET_CFLAGS])
+        VLC_ADD_LIBS([uchardet], [$UCHARDET_LIBS])
+      ],[
+        AC_MSG_WARN([${UCHARDET_PKG_ERRORS}. Subtitle encoding detection will be disabled])
+      ])
+  ])
+AM_CONDITIONAL([HAVE_UCHARDET], [test "${have_uchardet}" = "yes"])
+
+dnl
 dnl ARIB subtitles rendering module
 dnl
 AC_ARG_ENABLE(aribsub,
diff --git a/modules/demux/Makefile.am b/modules/demux/Makefile.am
index be6ed770fa..eb8f2b8899 100644
--- a/modules/demux/Makefile.am
+++ b/modules/demux/Makefile.am
@@ -61,7 +61,12 @@ libmjpeg_plugin_la_SOURCES = demux/mjpeg.c demux/mxpeg_helper.h
 demux_LTLIBRARIES += libmjpeg_plugin.la
 
 libsubtitle_plugin_la_SOURCES = demux/subtitle.c
+libsubtitle_plugin_la_CFLAGS = $(AM_CFLAGS)
 libsubtitle_plugin_la_LIBADD = $(LIBM)
+if HAVE_UCHARDET
+libsubtitle_plugin_la_CFLAGS += -DHAVE_UCHARDET $(UCHARDET_CFLAGS)
+libsubtitle_plugin_la_LIBADD += $(UCHARDET_LIBS)
+endif
 demux_LTLIBRARIES += libsubtitle_plugin.la
 
 libty_plugin_la_SOURCES = demux/ty.c codec/cc.h
diff --git a/modules/demux/subtitle.c b/modules/demux/subtitle.c
index 9af186edf9..181803104b 100644
--- a/modules/demux/subtitle.c
+++ b/modules/demux/subtitle.c
@@ -42,6 +42,10 @@
 #include <vlc_demux.h>
 #include <vlc_charset.h>
 
+#ifdef HAVE_UCHARDET
+# include <uchardet.h>
+#endif
+
 /*****************************************************************************
  * Module descriptor
  *****************************************************************************/
@@ -57,6 +61,8 @@ static void Close( vlc_object_t *p_this );
     N_("Force the subtiles format. Selecting \"auto\" means autodetection and should always work.")
 #define SUB_DESCRIPTION_LONGTEXT \
     N_("Override the default track description.")
+#define SUB_AUTODETECT_LONGTEXT \
+    N_("Detect text encoding of subtitle files.")
 
 static const char *const ppsz_sub_type[] =
 {
@@ -83,6 +89,10 @@ vlc_module_begin ()
         change_string_list( ppsz_sub_type, ppsz_sub_type )
     add_string( "sub-description", NULL, N_("Subtitle description"),
                 SUB_DESCRIPTION_LONGTEXT, true )
+#ifdef HAVE_UCHARDET
+    add_bool( "sub-detect-encoding", true, N_("Subtitle encoding detection"),
+              SUB_AUTODETECT_LONGTEXT, true )
+#endif
     set_callbacks( Open, Close )
 
     add_shortcut( "subtitle" )
@@ -127,6 +137,30 @@ typedef struct
 static int  TextLoad( text_t *, stream_t *s );
 static void TextUnload( text_t * );
 
+#ifdef HAVE_UCHARDET
+static int TextConvert( text_t const *p_src, text_t *p_dst, char const *psz_encoding )
+{   /* Fill p_dst with one FromCharset() result per line of p_src; p_src is only read, never modified. */
+    p_dst->line           = calloc( p_src->i_line_count, sizeof( *p_dst->line ) ); /* zero-filled array, one slot per source line */
+    if( !p_dst->line )
+        return VLC_ENOMEM; /* the remaining p_dst fields have not been written at this point */
+    p_dst->i_line_count   = p_src->i_line_count;
+    p_dst->i_line         = p_src->i_line; /* carry over the source's i_line value unchanged */
+
+    for( size_t i = 0; i < p_src->i_line_count; ++i )
+    {
+        char *psz_line = FromCharset( psz_encoding, p_src->line[ i ], strlen( p_src->line[ i ] ) ); /* NOTE(review): presumably returns a newly allocated UTF-8 string - confirm in vlc_charset.h */
+        if( !psz_line )
+        {
+            TextUnload( p_dst ); /* NOTE(review): presumably releases the lines stored so far; TextUnload is defined outside this hunk - confirm */
+            return VLC_EGENERIC; /* a single failed line aborts the whole conversion */
+        }
+        p_dst->line[ i ] = psz_line; /* p_dst takes over the pointer returned by FromCharset() */
+    }
+
+    return VLC_SUCCESS; /* reached only when every line was converted */
+}
+#endif
+
 typedef struct
 {
     int64_t i_start;
@@ -648,6 +682,62 @@ static int Open ( vlc_object_t *p_this )
     text_t txtlines;
     TextLoad( &txtlines, p_demux->s );
 
+#ifdef HAVE_UCHARDET
+    bool b_utf8 = false;
+
+    if( var_InheritBool( p_demux, "sub-detect-encoding" ) )
+    {
+        if( e_bom == UTF8BOM )
+            b_utf8 = true;
+        else if( txtlines.i_line_count > 0 )
+        {
+            uchardet_t  p_handle = uchardet_new();
+
+            if( p_handle )
+            {
+                size_t i = 0;
+                int i_retval = 0;
+
+                do
+                {
+                    const char *line  = txtlines.line[ i ];
+                    size_t      i_len = strlen( line );
+                    if( i_len > 0 )
+                        i_retval = uchardet_handle_data( p_handle, line, i_len );
+                } while( i_retval == 0 && ++i < txtlines.i_line_count );
+
+                if( i_retval == 0 )
+                {
+                    uchardet_data_end( p_handle );
+                    const char *psz_encoding = uchardet_get_charset( p_handle );
+                    if( psz_encoding && *psz_encoding )
+                    {
+                        msg_Dbg( p_demux, "%s subtitle encoding detected", psz_encoding );
+
+                        text_t dst;
+                        if( TextConvert( &txtlines, &dst, psz_encoding ) == VLC_SUCCESS )
+                        {
+                            TextUnload( &txtlines );
+                            txtlines = dst;
+                            b_utf8 = true;
+                        }
+                        else
+                            msg_Dbg( p_demux, "failed to convert subtitles to UTF-8" );
+                    }
+                    else
+                        msg_Dbg( p_demux, "failed to detect subtitle encoding" );
+                }
+                else
+                    msg_Err( p_demux, "error in uchardet_handle_data()" );
+
+                uchardet_delete( p_handle );
+            }
+            else
+                msg_Err( p_demux, "error in uchardet_new()" );
+        }
+    }
+#endif
+
     /* Parse it */
     for( size_t i_max = 0; i_max < SIZE_MAX - 500 * sizeof(subtitle_t); )
     {
@@ -708,6 +798,11 @@ static int Open ( vlc_object_t *p_this )
                  p_demux->psz_location );
     }
 
+#ifdef HAVE_UCHARDET
+    if( b_utf8 )
+        fmt.subs.psz_encoding = strdup( "UTF-8" );
+#endif
+
     char *psz_description = var_InheritString( p_demux, "sub-description" );
     if( psz_description && *psz_description )
         fmt.psz_description = psz_description;
-- 
2.13.7



More information about the vlc-devel mailing list