[vlc-devel] [PATCH 2/3] Detect subtitles charset using uchardet
Thomas Guillem
thomas at gllm.fr
Mon Apr 8 09:43:14 CEST 2019
On Sun, Apr 7, 2019, at 20:39, pertuleha at gmail.com wrote:
> From: Aleksei Pertu <pertuleha at gmail.com>
>
> ---
> modules/demux/Makefile.am | 8 ++-
> modules/demux/subtitle.c | 104 +++++++++++++++++++++++++++++++++++++-
> 2 files changed, 109 insertions(+), 3 deletions(-)
>
> diff --git a/modules/demux/Makefile.am b/modules/demux/Makefile.am
> index 85d4b3fba9..6802c4909f 100644
> --- a/modules/demux/Makefile.am
> +++ b/modules/demux/Makefile.am
> @@ -61,7 +61,13 @@ libmjpeg_plugin_la_SOURCES = demux/mjpeg.c
> demux/mxpeg_helper.h
> demux_LTLIBRARIES += libmjpeg_plugin.la
>
> libsubtitle_plugin_la_SOURCES = demux/subtitle.c
> -libsubtitle_plugin_la_LIBADD = $(LIBM)
> +if HAVE_UCHARDET
> +libsubtitle_plugin_la_CFLAGS = -DHAVE_UCHARDET
> +libsubtitle_plugin_la_LIBADD = $(LIBS_uchardet)
I don't think the conditional libsubtitle_plugin_la_LIBADD is needed: LIBS_uchardet will be empty if !HAVE_UCHARDET.
> +else
> +libsubtitle_plugin_la_LIBADD =
> +endif
> +libsubtitle_plugin_la_LIBADD += $(LIBM)
> demux_LTLIBRARIES += libsubtitle_plugin.la
>
> libty_plugin_la_SOURCES = demux/ty.c codec/cc.h \
> diff --git a/modules/demux/subtitle.c b/modules/demux/subtitle.c
> index c715402da4..304a65f29a 100644
> --- a/modules/demux/subtitle.c
> +++ b/modules/demux/subtitle.c
> @@ -41,6 +41,11 @@
> #include <vlc_demux.h>
> #include <vlc_charset.h>
>
> +#ifdef HAVE_UCHARDET
> +# include <uchardet/uchardet.h>
> +# define SUBTITLE_C_NEED_MERGE_TEXT
> +#endif
> +
>
> /*****************************************************************************
> * Module descriptor
>
> *****************************************************************************/
> @@ -51,6 +56,8 @@ static void Close( vlc_object_t *p_this );
> N_("Force the subtiles format. Selecting \"auto\" means
> autodetection and should always work.")
> #define SUB_DESCRIPTION_LONGTEXT \
> N_("Override the default track description.")
> +#define SUB_DETECT_CHARSET_LONGTEXT \
> + N_("Try to auto-detect subtitles character encoding (with
> heuristic).")
>
> static const char *const ppsz_sub_type[] =
> {
> @@ -71,6 +78,8 @@ vlc_module_begin ()
> change_string_list( ppsz_sub_type, ppsz_sub_type )
> add_string( "sub-description", NULL, N_("Subtitle description"),
> SUB_DESCRIPTION_LONGTEXT, true )
> + add_bool( "sub-autodetect-charset", true, N_("Auto-detect
> subtitles encoding"),
> + SUB_DETECT_CHARSET_LONGTEXT, true );
> set_callbacks( Open, Close )
>
> add_shortcut( "subtitle" )
> @@ -237,6 +246,10 @@ static int Control( demux_t *, int, va_list );
> static void Fix( demux_t * );
> static char * get_language_from_filename( const char * );
>
> +#ifdef HAVE_UCHARDET
> +static char * DetectCharset( text_t *txt );
> +#endif
> +
> /*****************************************************************************
> * Decoder format output function
> *****************************************************************************/
> @@ -661,8 +674,6 @@ static int Open ( vlc_object_t *p_this )
>
> p_sys->subtitles.i_count++;
> }
> - /* Unload */
> - TextUnload( &txtlines );
>
> msg_Dbg(p_demux, "loaded %zu subtitles", p_sys->subtitles.i_count );
>
> @@ -682,6 +693,25 @@ static int Open ( vlc_object_t *p_this )
> else
> es_format_Init( &fmt, SPU_ES, VLC_CODEC_SUBT );
>
> + /* Try to detect subtitles charset */
> + bool b_charset_autodetection = var_InheritBool( p_demux,
> "sub-autodetect-charset" );
> + if ( b_charset_autodetection && NULL == fmt.subs.psz_encoding ) {
> +#ifdef HAVE_UCHARDET
> + char *charset = DetectCharset( &txtlines );
> + if ( NULL != charset ) {
> + msg_Info( p_demux, "auto-detected charset: %s", charset );
> + fmt.subs.psz_encoding = charset;
> + } else {
> + msg_Info( p_demux, "charset auto-detection failed" );
> + }
> +#else /* !HAVE_UCHARDET */
> + msg_Warn( p_demux, "charset auto-detection enabled, but
> uchardet not linked, skipping detection" );
> +#endif
> + }
> +
> + /* Unload subtitles text */
> + TextUnload( &txtlines );
> +
> p_sys->subtitles.i_current = 0;
> p_sys->i_length = 0;
> if( p_sys->subtitles.i_count > 0 )
> @@ -944,6 +974,7 @@ static int TextLoad( text_t *txt, stream_t *s )
>
> return VLC_SUCCESS;
> }
> +
> static void TextUnload( text_t *txt )
> {
> if( txt->i_line_count )
> @@ -963,12 +994,19 @@ static char *TextGetLine( text_t *txt )
>
> return txt->line[txt->i_line++];
> }
> +
> static void TextPreviousLine( text_t *txt )
> {
> if( txt->i_line > 0 )
> txt->i_line--;
> }
>
> +#ifdef SUBTITLE_C_NEED_MERGE_TEXT
> +static void TextResetLine( text_t *txt ) {
> + txt->i_line = 0;
> +}
Is it possible to put this function in the SUBTITLE_C_NEED_MERGE_TEXT block below?
> +#endif
> +
>
> /*****************************************************************************
> * Specific Subtitle function
>
> *****************************************************************************/
> @@ -2441,3 +2479,65 @@ static char * get_language_from_filename( const
> char * psz_sub_file )
> free( psz_work );
> return psz_ret;
> }
> +
> +
> +#ifdef SUBTITLE_C_NEED_MERGE_TEXT
> +
> +static char * MergeTxtLines( text_t *txt ) {
> + char *psz_merged = malloc( 1 );
> + size_t i_merged_len = 0;
> + psz_merged[i_merged_len] = '\0';
> +
> + TextResetLine( txt );
> + for ( char *psz_line = TextGetLine( txt );
> + NULL != psz_line;
> + psz_line = TextGetLine( txt ) ) {
> +
> + size_t i_line_len = strlen( psz_line );
> +
> + psz_merged = realloc( psz_merged, i_merged_len + i_line_len +
> 1 );
> + if ( NULL == psz_merged ) {
> + return NULL;
> + }
> +
> + /* strcat( (dst + dst_len), src ) instead of simple strcat(
> dst, src )
> + optimizes text concat to O(N) instead of O(N^2) */
> + strcat( (psz_merged + i_merged_len), psz_line );
> + i_merged_len += i_line_len;
> + }
> + TextResetLine( txt );
> +
> + return psz_merged;
> +}
> +
> +#endif /* SUBTITLE_C_NEED_MERGE_TEXT */
> +
> +
> +#ifdef HAVE_UCHARDET
> +
> +static char * DetectCharset( text_t *txt ) {
> + uchardet_t ud = uchardet_new();
> +
> + /* subtitles lines are merged because
> + uchardet's full-text result is better than line-by-line result
> */
> + char *psz_text = MergeTxtLines( txt );
> +
> + uchardet_handle_data( ud, psz_text, strlen( psz_text ) );
> + uchardet_data_end( ud );
> +
> + char *psz_detected_charset = (char *) uchardet_get_charset( ud );
> + if ( 0 == strcmp( psz_detected_charset, "" )
> + || 0 == strcmp (psz_detected_charset, "ASCII" ) ) {
> +
> + psz_detected_charset = NULL;
> + } else {
> + /* uchardet's result will be freed on uchardet_delete() =>
> strdup */
> + psz_detected_charset = strdup( psz_detected_charset );
> + }
> +
> + uchardet_delete( ud );
> +
> + return psz_detected_charset;
> +}
> +
> +#endif /* HAVE_UCHARDET */
OK with the rest of the patch.
> --
> 2.20.1
>
> _______________________________________________
> vlc-devel mailing list
> To unsubscribe or modify your subscription options:
> https://mailman.videolan.org/listinfo/vlc-devel
More information about the vlc-devel mailing list