[vlc-devel] [PATCH 2/2] taglib: detect charset when ID3v2 Latin-1 parser finds invalid character
sojulibra at gmail.com
sojulibra at gmail.com
Fri Oct 23 12:46:56 CEST 2020
From: Souju TANAKA <sojulibra at gmail.com>
Changed the TagLib Latin-1 parser to check whether an ISO 8859-1 encoded ID3v2 tag
is a valid byte sequence. If an invalid Latin-1 character is found, try to detect
the charset and convert the tag into UTF-8 to avoid mojibake.
Some encoders embed ID3v2 tags in an unexpected charset, even though this is against
the spec. TagLib allows overriding TagLib::ID3v2::Latin1StringHandler::parse() to deal
with this practical situation.
---
include/vlc_charset.h | 20 +++++++++++
modules/meta_engine/Makefile.am | 2 +-
modules/meta_engine/taglib.cpp | 63 +++++++++++++++++++++++++++++++++
3 files changed, 84 insertions(+), 1 deletion(-)
diff --git a/include/vlc_charset.h b/include/vlc_charset.h
index 0ec1734dc9..311856913e 100644
--- a/include/vlc_charset.h
+++ b/include/vlc_charset.h
@@ -93,6 +93,26 @@ VLC_USED static inline const char *IsASCII(const char *str)
return str;
}
+/**
+ * Checks ISO/IEC 8859-1 validity.
+ *
+ * Checks whether a null-terminated string only contains bytes that are
+ * plausible in ISO/IEC 8859-1 text: the graphic ranges 0x20-0x7E and
+ * 0xA0-0xFF, plus the whitespace controls HT (0x09), LF (0x0A) and CR (0x0D).
+ *
+ * Tabs and line breaks are accepted because they legitimately occur in
+ * multi-line text and give no hint that another charset is in use. Every
+ * other C0 control, DEL (0x7F) and the C1 range 0x80-0x9F make the check fail.
+ *
+ * \param str string to check
+ *
+ * \retval str the string only contains bytes accepted as ISO/IEC 8859-1 text
+ * \retval NULL the string contains at least one byte outside those ranges
+ */
+VLC_USED static inline const char *IsLatin1(const char *str)
+{
+    for (const char *p = str; *p != '\0'; p++)
+    {
+        const unsigned char c = (unsigned char)*p;
+
+        /* Plain whitespace controls are not evidence of a foreign charset. */
+        if (c == '\t' || c == '\n' || c == '\r')
+            continue;
+        /* Remaining C0 controls, DEL and the C1 block are not Latin-1 text. */
+        if (unlikely(c < 0x20 || (c > 0x7e && c < 0xa0)))
+            return NULL;
+    }
+    return str;
+}
+
/**
* Removes non-UTF-8 sequences.
*
diff --git a/modules/meta_engine/Makefile.am b/modules/meta_engine/Makefile.am
index f74d007d76..73479a64b0 100644
--- a/modules/meta_engine/Makefile.am
+++ b/modules/meta_engine/Makefile.am
@@ -7,6 +7,6 @@ libtaglib_plugin_la_SOURCES = meta_engine/taglib.cpp \
demux/xiph_metadata.h demux/xiph_metadata.c
libtaglib_plugin_la_CXXFLAGS = $(AM_CXXFLAGS) $(TAGLIB_CFLAGS)
libtaglib_plugin_la_LDFLAGS = $(AM_LDFLAGS) -rpath '$(metadir)'
-libtaglib_plugin_la_LIBADD = $(TAGLIB_LIBS) -lz
+libtaglib_plugin_la_LIBADD = $(TAGLIB_LIBS) -lz -luchardet
EXTRA_LTLIBRARIES += libtaglib_plugin.la
meta_LTLIBRARIES += $(LTLIBtaglib)
diff --git a/modules/meta_engine/taglib.cpp b/modules/meta_engine/taglib.cpp
index 480968edfd..952ec8269b 100644
--- a/modules/meta_engine/taglib.cpp
+++ b/modules/meta_engine/taglib.cpp
@@ -68,6 +68,7 @@
#include <asffile.h>
#include <apetag.h>
#include <flacfile.h>
+#include <iconv.h>
#include <mpcfile.h>
#include <mpegfile.h>
#include <mp4file.h>
@@ -81,6 +82,7 @@
#include <speexfile.h>
#include <trueaudiofile.h>
+#include <uchardet/uchardet.h>
#include <vorbisfile.h>
#include <wavpackfile.h>
@@ -142,6 +144,7 @@ static vlc::threads::mutex taglib_lock;
// Local functions
static int ReadMeta ( vlc_object_t * );
static int WriteMeta ( vlc_object_t * );
+static char *TryDetectCharset( const char * );
vlc_module_begin ()
set_capability( "meta reader", 1000 )
@@ -278,6 +281,41 @@ private:
long m_seqReadLimit;
};
+/**
+ * ID3v2 Latin-1 string handler that falls back to charset detection.
+ *
+ * Tags declared as ISO 8859-1 are first decoded as such. If the decoded bytes
+ * fail the IsLatin1() check, the charset is guessed with TryDetectCharset()
+ * and the bytes are converted to UTF-8 with FromCharset(). Whenever detection
+ * or conversion fails, the plain Latin-1 interpretation is returned.
+ */
+class Latin1StringHandlerWithCharsetDetection : public ID3v2::Latin1StringHandler
+{
+public:
+    /**
+     * Decodes the payload of a text field flagged as Latin-1.
+     *
+     * \param data raw bytes of the field
+     * \return the decoded string: either the Latin-1 interpretation of
+     *         \p data, or its UTF-8 conversion from the detected charset
+     */
+    String parse(const ByteVector &data) const override
+    {
+        String str( data, String::Latin1 );
+        if ( IsLatin1( str.toCString( false ) ) )
+            return str;
+
+        static constexpr unsigned int i_minlen = 16;
+        String test_str = str;
+        if ( str.length() < i_minlen )
+        {
+            /* Lengthen for the better estimation: short samples are repeated
+             * so that the detector gets more input to work with. */
+            /* An empty string passes IsLatin1() and has already returned. */
+            assert(str.length() != 0);
+            for ( unsigned int i = 0; i < (i_minlen - 1) / str.length(); i++ )
+                test_str += str;
+        }
+        char *psz_charset = TryDetectCharset( test_str.toCString( false ) );
+        if ( psz_charset == NULL )
+            return str;
+
+        char *psz_utf8 = FromCharset( psz_charset, str.toCString( false ), str.length() );
+        free( psz_charset );
+        /* Conversion can fail (unknown charset name, invalid sequence);
+         * building a String from NULL would crash, so keep the Latin-1 text. */
+        if ( psz_utf8 == NULL )
+            return str;
+
+        String converted( psz_utf8, String::UTF8 );
+        free( psz_utf8 );
+        return converted;
+    }
+};
+
+namespace
+{
+    /* File-local handler instance whose address is handed to TagLib. */
+    const Latin1StringHandlerWithCharsetDetection string_handler{};
+}
+
static int ExtractCoupleNumberValues( vlc_meta_t* p_meta, const char *psz_value,
vlc_meta_type_t first, vlc_meta_type_t second)
{
@@ -915,6 +953,8 @@ static int ReadMeta( vlc_object_t* p_this)
else
s.setMaxSequentialRead( 1024 * 2048 );
#endif
+ ID3v2::Tag::setLatin1StringHandler(&string_handler);
+
f = FileRef( &s, false, AudioProperties::ReadStyle::Fast );
if( f.isNull() )
@@ -1350,3 +1390,26 @@ static int WriteMeta( vlc_object_t *p_this )
return VLC_SUCCESS;
}
+
+/**
+ * Guesses the charset of a null-terminated byte string with uchardet.
+ *
+ * \param str bytes to analyse
+ * \return a heap-allocated copy of the detected charset name, to be released
+ *         with free(), or NULL if the data could not be handled or no
+ *         charset name was reported
+ */
+static char *TryDetectCharset( const char *str )
+{
+    char *psz_result = NULL;
+    uchardet_t detector = uchardet_new();
+
+    if( uchardet_handle_data( detector, str, strlen(str) ) == 0 )
+    {
+        uchardet_data_end( detector );
+
+        /* The name is copied before the detector is deleted below. */
+        const char *psz_name = uchardet_get_charset( detector );
+        if( psz_name != NULL && psz_name[0] != '\0' )
+            psz_result = strdup( psz_name );
+    }
+
+    uchardet_delete( detector );
+    return psz_result;
+}
--
2.25.1
More information about the vlc-devel
mailing list