[vlc-devel] [PATCH 2/3] Detect subtitles charset using uchardet

pertuleha at gmail.com pertuleha at gmail.com
Sun Apr 7 20:38:25 CEST 2019


From: Aleksei Pertu <pertuleha at gmail.com>

---
 modules/demux/Makefile.am |   8 ++-
 modules/demux/subtitle.c  | 104 +++++++++++++++++++++++++++++++++++++-
 2 files changed, 109 insertions(+), 3 deletions(-)

diff --git a/modules/demux/Makefile.am b/modules/demux/Makefile.am
index 85d4b3fba9..6802c4909f 100644
--- a/modules/demux/Makefile.am
+++ b/modules/demux/Makefile.am
@@ -61,7 +61,13 @@ libmjpeg_plugin_la_SOURCES = demux/mjpeg.c demux/mxpeg_helper.h
 demux_LTLIBRARIES += libmjpeg_plugin.la
 
 libsubtitle_plugin_la_SOURCES = demux/subtitle.c
-libsubtitle_plugin_la_LIBADD = $(LIBM)
+if HAVE_UCHARDET
+libsubtitle_plugin_la_CFLAGS = -DHAVE_UCHARDET
+libsubtitle_plugin_la_LIBADD = $(LIBS_uchardet)
+else
+libsubtitle_plugin_la_LIBADD =
+endif
+libsubtitle_plugin_la_LIBADD += $(LIBM)
 demux_LTLIBRARIES += libsubtitle_plugin.la
 
 libty_plugin_la_SOURCES = demux/ty.c codec/cc.h \
diff --git a/modules/demux/subtitle.c b/modules/demux/subtitle.c
index c715402da4..304a65f29a 100644
--- a/modules/demux/subtitle.c
+++ b/modules/demux/subtitle.c
@@ -41,6 +41,11 @@
 #include <vlc_demux.h>
 #include <vlc_charset.h>
 
+#ifdef HAVE_UCHARDET
+# include <uchardet/uchardet.h>
+# define SUBTITLE_C_NEED_MERGE_TEXT
+#endif
+
 /*****************************************************************************
  * Module descriptor
  *****************************************************************************/
@@ -51,6 +56,8 @@ static void Close( vlc_object_t *p_this );
     N_("Force the subtiles format. Selecting \"auto\" means autodetection and should always work.")
 #define SUB_DESCRIPTION_LONGTEXT \
     N_("Override the default track description.")
+#define SUB_DETECT_CHARSET_LONGTEXT \
+    N_("Try to auto-detect subtitles character encoding (with heuristic).")
 
 static const char *const ppsz_sub_type[] =
 {
@@ -71,6 +78,8 @@ vlc_module_begin ()
         change_string_list( ppsz_sub_type, ppsz_sub_type )
     add_string( "sub-description", NULL, N_("Subtitle description"),
                 SUB_DESCRIPTION_LONGTEXT, true )
+    add_bool( "sub-autodetect-charset", true, N_("Auto-detect subtitles encoding"),
+                SUB_DETECT_CHARSET_LONGTEXT, true );
     set_callbacks( Open, Close )
 
     add_shortcut( "subtitle" )
@@ -237,6 +246,10 @@ static int Control( demux_t *, int, va_list );
 static void Fix( demux_t * );
 static char * get_language_from_filename( const char * );
 
+#ifdef HAVE_UCHARDET
+static char * DetectCharset( text_t *txt );
+#endif
+
 /*****************************************************************************
  * Decoder format output function
  *****************************************************************************/
@@ -661,8 +674,6 @@ static int Open ( vlc_object_t *p_this )
 
         p_sys->subtitles.i_count++;
     }
-    /* Unload */
-    TextUnload( &txtlines );
 
     msg_Dbg(p_demux, "loaded %zu subtitles", p_sys->subtitles.i_count );
 
@@ -682,6 +693,25 @@ static int Open ( vlc_object_t *p_this )
     else
         es_format_Init( &fmt, SPU_ES, VLC_CODEC_SUBT );
 
+    /* Try to detect subtitles charset */
+    bool b_charset_autodetection = var_InheritBool( p_demux, "sub-autodetect-charset" );
+    if ( b_charset_autodetection && NULL == fmt.subs.psz_encoding ) {
+#ifdef HAVE_UCHARDET
+        char *charset = DetectCharset( &txtlines );
+        if ( NULL != charset ) {
+            msg_Info( p_demux, "auto-detected charset: %s", charset );
+            fmt.subs.psz_encoding = charset;
+        } else {
+            msg_Info( p_demux, "charset auto-detection failed" );
+        }
+#else /* !HAVE_UCHARDET */
+        msg_Warn( p_demux, "charset auto-detection enabled, but uchardet not linked, skipping detection" );
+#endif
+    }
+
+    /* Unload subtitles text */
+    TextUnload( &txtlines );
+
     p_sys->subtitles.i_current = 0;
     p_sys->i_length = 0;
     if( p_sys->subtitles.i_count > 0 )
@@ -944,6 +974,7 @@ static int TextLoad( text_t *txt, stream_t *s )
 
     return VLC_SUCCESS;
 }
+
 static void TextUnload( text_t *txt )
 {
     if( txt->i_line_count )
@@ -963,12 +994,19 @@ static char *TextGetLine( text_t *txt )
 
     return txt->line[txt->i_line++];
 }
+
 static void TextPreviousLine( text_t *txt )
 {
     if( txt->i_line > 0 )
         txt->i_line--;
 }
 
+#ifdef SUBTITLE_C_NEED_MERGE_TEXT
+/* Rewind the read cursor of txt to index 0, i.e. the slot TextGetLine()
+   reads its next line from. */
+static void TextResetLine( text_t *txt ) { txt->i_line = 0; }
+#endif
+
 /*****************************************************************************
  * Specific Subtitle function
  *****************************************************************************/
@@ -2441,3 +2479,65 @@ static char * get_language_from_filename( const char * psz_sub_file )
     free( psz_work );
     return psz_ret;
 }
+
+
+#ifdef SUBTITLE_C_NEED_MERGE_TEXT
+
+/* Concatenate every line of txt, without separators, into one heap string.
+   The read cursor of txt is rewound before and after the walk.
+   Returns NULL on allocation failure; else the caller must free() it. */
+static char * MergeTxtLines( text_t *txt ) {
+    size_t i_merged_len = 0;
+    char *psz_merged = calloc( 1, 1 ); /* starts as the empty string */
+    TextResetLine( txt );
+    for ( char *psz_line = TextGetLine( txt );
+          NULL != psz_merged && NULL != psz_line;
+          psz_line = TextGetLine( txt ) ) {
+        size_t i_line_len = strlen( psz_line );
+        char *psz_grown = realloc( psz_merged, i_merged_len + i_line_len + 1 );
+        if ( NULL == psz_grown ) {
+            /* realloc() keeps the old buffer on failure: release it */
+            free( psz_merged );
+            psz_merged = NULL;
+            break;
+        }
+        /* append at the known end (O(N) overall); the NUL is copied too */
+        memcpy( psz_grown + i_merged_len, psz_line, i_line_len + 1 );
+        psz_merged = psz_grown;
+        i_merged_len += i_line_len;
+    }
+    TextResetLine( txt );
+    return psz_merged;
+}
+
+#endif /* SUBTITLE_C_NEED_MERGE_TEXT */
+
+
+#ifdef HAVE_UCHARDET
+
+/* Guess the encoding of the loaded subtitle text with uchardet.
+   Returns a strdup()'ed charset name that the caller must free(), or NULL
+   when uchardet reports "" or "ASCII" or when a step fails. */
+static char * DetectCharset( text_t *txt ) {
+    char *psz_detected_charset = NULL;
+    /* subtitle lines are merged because uchardet's full-text result
+       is better than its line-by-line result */
+    char *psz_text = MergeTxtLines( txt );
+    uchardet_t ud = uchardet_new();
+
+    /* psz_text is NULL when MergeTxtLines() could not allocate memory */
+    if ( NULL != ud && NULL != psz_text
+         && 0 == uchardet_handle_data( ud, psz_text, strlen( psz_text ) ) ) {
+        uchardet_data_end( ud );
+        /* uchardet's result will be freed on uchardet_delete() => strdup */
+        const char *psz_name = uchardet_get_charset( ud );
+        if ( 0 != strcmp( psz_name, "" ) && 0 != strcmp( psz_name, "ASCII" ) )
+            psz_detected_charset = strdup( psz_name );
+    }
+    if ( NULL != ud )
+        uchardet_delete( ud );
+    free( psz_text ); /* the merged copy is only needed for the analysis */
+    return psz_detected_charset;
+}
+
+#endif /* HAVE_UCHARDET */
-- 
2.20.1



More information about the vlc-devel mailing list