[vlc-devel] [PATCH 2/3] Detect subtitles charset using uchardet

Thomas Guillem thomas at gllm.fr
Mon Apr 8 09:43:14 CEST 2019


On Sun, Apr 7, 2019, at 20:39, pertuleha at gmail.com wrote:
> From: Aleksei Pertu <pertuleha at gmail.com>
> 
> ---
>  modules/demux/Makefile.am |   8 ++-
>  modules/demux/subtitle.c  | 104 +++++++++++++++++++++++++++++++++++++-
>  2 files changed, 109 insertions(+), 3 deletions(-)
> 
> diff --git a/modules/demux/Makefile.am b/modules/demux/Makefile.am
> index 85d4b3fba9..6802c4909f 100644
> --- a/modules/demux/Makefile.am
> +++ b/modules/demux/Makefile.am
> @@ -61,7 +61,13 @@ libmjpeg_plugin_la_SOURCES = demux/mjpeg.c 
> demux/mxpeg_helper.h
>  demux_LTLIBRARIES += libmjpeg_plugin.la
>  
>  libsubtitle_plugin_la_SOURCES = demux/subtitle.c
> -libsubtitle_plugin_la_LIBADD = $(LIBM)
> +if HAVE_UCHARDET
> +libsubtitle_plugin_la_CFLAGS = -DHAVE_UCHARDET
> +libsubtitle_plugin_la_LIBADD = $(LIBS_uchardet)

I don't think the conditional libsubtitle_plugin_la_LIBADD is needed: LIBS_uchardet will be empty if !HAVE_UCHARDET.

> +else
> +libsubtitle_plugin_la_LIBADD =
> +endif
> +libsubtitle_plugin_la_LIBADD += $(LIBM)
>  demux_LTLIBRARIES += libsubtitle_plugin.la
>  
>  libty_plugin_la_SOURCES = demux/ty.c codec/cc.h \
> diff --git a/modules/demux/subtitle.c b/modules/demux/subtitle.c
> index c715402da4..304a65f29a 100644
> --- a/modules/demux/subtitle.c
> +++ b/modules/demux/subtitle.c
> @@ -41,6 +41,11 @@
>  #include <vlc_demux.h>
>  #include <vlc_charset.h>
>  
> +#ifdef HAVE_UCHARDET
> +# include <uchardet/uchardet.h>
> +# define SUBTITLE_C_NEED_MERGE_TEXT
> +#endif
> +
>  
> /*****************************************************************************
>   * Module descriptor
>   
> *****************************************************************************/
> @@ -51,6 +56,8 @@ static void Close( vlc_object_t *p_this );
>      N_("Force the subtiles format. Selecting \"auto\" means 
> autodetection and should always work.")
>  #define SUB_DESCRIPTION_LONGTEXT \
>      N_("Override the default track description.")
> +#define SUB_DETECT_CHARSET_LONGTEXT \
> +    N_("Try to auto-detect subtitles character encoding (with 
> heuristic).")
>  
>  static const char *const ppsz_sub_type[] =
>  {
> @@ -71,6 +78,8 @@ vlc_module_begin ()
>          change_string_list( ppsz_sub_type, ppsz_sub_type )
>      add_string( "sub-description", NULL, N_("Subtitle description"),
>                  SUB_DESCRIPTION_LONGTEXT, true )
> +    add_bool( "sub-autodetect-charset", true, N_("Auto-detect 
> subtitles encoding"),
> +                SUB_DETECT_CHARSET_LONGTEXT, true );
>      set_callbacks( Open, Close )
>  
>      add_shortcut( "subtitle" )
> @@ -237,6 +246,10 @@ static int Control( demux_t *, int, va_list );
>  static void Fix( demux_t * );
>  static char * get_language_from_filename( const char * );
>  
> +#ifdef HAVE_UCHARDET
> +static char * DetectCharset( text_t *txt );
> +#endif
> +
>  /*****************************************************************************
>   * Decoder format output function
>   *****************************************************************************/
> @@ -661,8 +674,6 @@ static int Open ( vlc_object_t *p_this )
>  
>          p_sys->subtitles.i_count++;
>      }
> -    /* Unload */
> -    TextUnload( &txtlines );
>  
>      msg_Dbg(p_demux, "loaded %zu subtitles", p_sys->subtitles.i_count );
>  
> @@ -682,6 +693,25 @@ static int Open ( vlc_object_t *p_this )
>      else
>          es_format_Init( &fmt, SPU_ES, VLC_CODEC_SUBT );
>  
> +    /* Try to detect subtitles charset */
> +    bool b_charset_autodetection = var_InheritBool( p_demux, 
> "sub-autodetect-charset" );
> +    if ( b_charset_autodetection && NULL == fmt.subs.psz_encoding ) {
> +#ifdef HAVE_UCHARDET
> +        char *charset = DetectCharset( &txtlines );
> +        if ( NULL != charset ) {
> +            msg_Info( p_demux, "auto-detected charset: %s", charset );
> +            fmt.subs.psz_encoding = charset;
> +        } else {
> +            msg_Info( p_demux, "charset auto-detection failed" );
> +        }
> +#else /* !HAVE_UCHARDET */
> +        msg_Warn( p_demux, "charset auto-detection enabled, but 
> uchardet not linked, skipping detection" );
> +#endif
> +    }
> +
> +    /* Unload subtitles text */
> +    TextUnload( &txtlines );
> +
>      p_sys->subtitles.i_current = 0;
>      p_sys->i_length = 0;
>      if( p_sys->subtitles.i_count > 0 )
> @@ -944,6 +974,7 @@ static int TextLoad( text_t *txt, stream_t *s )
>  
>      return VLC_SUCCESS;
>  }
> +
>  static void TextUnload( text_t *txt )
>  {
>      if( txt->i_line_count )
> @@ -963,12 +994,19 @@ static char *TextGetLine( text_t *txt )
>  
>      return txt->line[txt->i_line++];
>  }
> +
>  static void TextPreviousLine( text_t *txt )
>  {
>      if( txt->i_line > 0 )
>          txt->i_line--;
>  }
>  
> +#ifdef SUBTITLE_C_NEED_MERGE_TEXT
> +static void TextResetLine( text_t *txt ) {
> +    txt->i_line = 0;
> +}

Is it possible to put this function in the SUBTITLE_C_NEED_MERGE_TEXT block below?

> +#endif
> +
>  
> /*****************************************************************************
>   * Specific Subtitle function
>   
> *****************************************************************************/
> @@ -2441,3 +2479,65 @@ static char * get_language_from_filename( const 
> char * psz_sub_file )
>      free( psz_work );
>      return psz_ret;
>  }
> +
> +
> +#ifdef SUBTITLE_C_NEED_MERGE_TEXT
> +
> +static char * MergeTxtLines( text_t *txt ) {
> +    char *psz_merged = malloc( 1 );
> +    size_t i_merged_len = 0;
> +    psz_merged[i_merged_len] = '\0';
> +
> +    TextResetLine( txt );
> +    for ( char *psz_line = TextGetLine( txt );
> +          NULL != psz_line;
> +          psz_line = TextGetLine( txt ) ) {
> +
> +        size_t i_line_len = strlen( psz_line );
> +
> +        psz_merged = realloc( psz_merged, i_merged_len + i_line_len + 
> 1 );
> +        if ( NULL == psz_merged ) {
> +            return NULL;
> +        }
> +
> +        /* strcat( (dst + dst_len), src ) instead of simple strcat( 
> dst, src )
> +           optimizes text concat to O(N) instead of O(N^2) */
> +        strcat( (psz_merged + i_merged_len), psz_line );
> +        i_merged_len += i_line_len;
> +    }
> +    TextResetLine( txt );
> +
> +    return psz_merged;
> +}
> +
> +#endif /* SUBTITLE_C_NEED_MERGE_TEXT */
> +
> +
> +#ifdef HAVE_UCHARDET
> +
> +static char * DetectCharset( text_t *txt ) {
> +    uchardet_t ud = uchardet_new();
> +
> +    /* subtitles lines are merged because
> +       uchardet's full-text result is better than line-by-line result 
> */
> +    char *psz_text = MergeTxtLines( txt );
> +
> +    uchardet_handle_data( ud, psz_text, strlen( psz_text ) );
> +    uchardet_data_end( ud );
> +
> +    char *psz_detected_charset = (char *) uchardet_get_charset( ud );
> +    if ( 0 == strcmp( psz_detected_charset, "" )
> +         || 0 == strcmp (psz_detected_charset, "ASCII" ) ) {
> +
> +        psz_detected_charset = NULL;
> +    } else {
> +        /* uchardet's result will be freed on uchardet_delete() => 
> strdup */
> +        psz_detected_charset = strdup( psz_detected_charset );
> +    }
> +
> +    uchardet_delete( ud );
> +
> +    return psz_detected_charset;
> +}
> +
> +#endif /* HAVE_UCHARDET */

OK with the rest of the patch.

> -- 
> 2.20.1
> 
> _______________________________________________
> vlc-devel mailing list
> To unsubscribe or modify your subscription options:
> https://mailman.videolan.org/listinfo/vlc-devel


More information about the vlc-devel mailing list