[vlc-commits] subtitle: deal with initial UTF-8 BOM
Rémi Denis-Courmont
git at videolan.org
Wed Jun 27 15:03:01 CEST 2012
vlc | branch: master | Rémi Denis-Courmont <remi at remlab.net> | Wed Jun 27 15:51:25 2012 +0300| [dc58170e84d728bd55d95da3c3f3884132ba4080] | committer: Rémi Denis-Courmont
subtitle: deal with initial UTF-8 BOM
If a UTF-8 BOM is found at the beginning of the text file, skip it
(it can confuse some parsers). Also mark the subtitle track explicitly
as UTF-8 encoded.
> http://git.videolan.org/gitweb.cgi/vlc.git/?a=commit;h=dc58170e84d728bd55d95da3c3f3884132ba4080
---
modules/demux/subtitle.c | 20 +++++++++++++++-----
1 file changed, 15 insertions(+), 5 deletions(-)
diff --git a/modules/demux/subtitle.c b/modules/demux/subtitle.c
index 2c24282..a42c5a8 100644
--- a/modules/demux/subtitle.c
+++ b/modules/demux/subtitle.c
@@ -295,6 +295,17 @@ static int Open ( vlc_object_t *p_this )
}
free( psz_type );
+ /* Detect Unicode while skipping the UTF-8 Byte Order Mark */
+ bool unicode = false;
+ const uint8_t *p_data;
+ if( stream_Peek( p_demux->s, &p_data, 3 ) >= 3
+ && !memcmp( p_data, "\xEF\xBB\xBF", 3 ) )
+ {
+ unicode = true;
+ stream_Seek( p_demux->s, 3 ); /* skip BOM */
+ msg_Dbg( p_demux, "detected Unicode Byte Order Mark" );
+ }
+
/* Probe if unknown type */
if( p_sys->i_type == SUB_TYPE_UNKNOWN )
{
@@ -442,15 +453,14 @@ static int Open ( vlc_object_t *p_this )
/* It will nearly always work even for non seekable stream thanks the
* caching system, and if it fails we lose just a few sub */
- if( stream_Seek( p_demux->s, 0 ) )
- {
+ if( stream_Seek( p_demux->s, unicode ? 3 : 0 ) )
msg_Warn( p_demux, "failed to rewind" );
- }
}
/* Quit on unknown subtitles */
if( p_sys->i_type == SUB_TYPE_UNKNOWN )
{
+ stream_Seek( p_demux->s, 0 );
msg_Warn( p_demux, "failed to recognize subtitle type" );
free( p_sys );
return VLC_EGENERIC;
@@ -518,9 +528,9 @@ static int Open ( vlc_object_t *p_this )
es_format_Init( &fmt, SPU_ES, VLC_CODEC_SSA );
}
else
- {
es_format_Init( &fmt, SPU_ES, VLC_CODEC_SUBT );
- }
+ if( unicode )
+ fmt.subs.psz_encoding = strdup( "UTF-8" );
char *psz_description = var_InheritString( p_demux, "sub-description" );
if( psz_description && *psz_description )
fmt.psz_description = psz_description;
More information about the vlc-commits
mailing list