[vlc-devel] [PATCH 2/2] taglib: detect charset when ID3v2 Latin-1 parser finds invalid character

sojulibra at gmail.com sojulibra at gmail.com
Fri Oct 23 12:46:56 CEST 2020


From: Souju TANAKA <sojulibra at gmail.com>

Changed the TagLib Latin-1 parser to check whether an ISO 8859-1 encoded ID3v2 tag
is a valid byte sequence. If an invalid Latin-1 character is found, try to detect
the charset and convert the tag into UTF-8 to avoid mojibake.

Some encoders embed ID3v2 text in an unexpected charset, although this is against
the spec. TagLib allows overriding TagLib::ID3v2::Latin1StringHandler::parse() to
deal with this practical situation.
---
 include/vlc_charset.h           | 20 +++++++++++
 modules/meta_engine/Makefile.am |  2 +-
 modules/meta_engine/taglib.cpp  | 63 +++++++++++++++++++++++++++++++++
 3 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/include/vlc_charset.h b/include/vlc_charset.h
index 0ec1734dc9..311856913e 100644
--- a/include/vlc_charset.h
+++ b/include/vlc_charset.h
@@ -93,6 +93,26 @@ VLC_USED static inline const char *IsASCII(const char *str)
     return str;
 }
 
+/**
+ * Checks printable ISO/IEC 8859-1 validity.
+ *
+ * Accepts only bytes in 0x20-0x7E and 0xA0-0xFF before the null terminator.
+ *
+ * \param str null-terminated string to check
+ *
+ * \retval str every byte of the string lies in one of the accepted ranges
+ * \retval NULL the string contains a byte below 0x20 or within 0x7F-0x9F
+ */
+VLC_USED static inline const char *IsLatin1(const char *str)
+{
+    const unsigned char *p = (const unsigned char *)str;
+
+    for (; *p != '\0'; p++)
+        if (unlikely(*p < 0x20 || (*p >= 0x7f && *p <= 0x9f)))
+            return NULL;
+    return str;
+}
+
 /**
  * Removes non-UTF-8 sequences.
  *
diff --git a/modules/meta_engine/Makefile.am b/modules/meta_engine/Makefile.am
index f74d007d76..73479a64b0 100644
--- a/modules/meta_engine/Makefile.am
+++ b/modules/meta_engine/Makefile.am
@@ -7,6 +7,6 @@ libtaglib_plugin_la_SOURCES = meta_engine/taglib.cpp \
 	demux/xiph_metadata.h demux/xiph_metadata.c
 libtaglib_plugin_la_CXXFLAGS = $(AM_CXXFLAGS) $(TAGLIB_CFLAGS)
 libtaglib_plugin_la_LDFLAGS = $(AM_LDFLAGS) -rpath '$(metadir)'
-libtaglib_plugin_la_LIBADD = $(TAGLIB_LIBS) -lz
+libtaglib_plugin_la_LIBADD = $(TAGLIB_LIBS) -lz -luchardet
 EXTRA_LTLIBRARIES += libtaglib_plugin.la
 meta_LTLIBRARIES += $(LTLIBtaglib)
diff --git a/modules/meta_engine/taglib.cpp b/modules/meta_engine/taglib.cpp
index 480968edfd..952ec8269b 100644
--- a/modules/meta_engine/taglib.cpp
+++ b/modules/meta_engine/taglib.cpp
@@ -68,6 +68,7 @@
 #include <asffile.h>
 #include <apetag.h>
 #include <flacfile.h>
+#include <iconv.h>
 #include <mpcfile.h>
 #include <mpegfile.h>
 #include <mp4file.h>
@@ -81,6 +82,7 @@
 
 #include <speexfile.h>
 #include <trueaudiofile.h>
+#include <uchardet/uchardet.h>
 #include <vorbisfile.h>
 #include <wavpackfile.h>
 
@@ -142,6 +144,7 @@ static vlc::threads::mutex taglib_lock;
 // Local functions
 static int ReadMeta    ( vlc_object_t * );
 static int WriteMeta   ( vlc_object_t * );
+static char *TryDetectCharset( const char * );
 
 vlc_module_begin ()
     set_capability( "meta reader", 1000 )
@@ -278,6 +281,41 @@ private:
     long m_seqReadLimit;
 };
 
+/* Latin-1 handler: when the tag bytes are not printable Latin-1, detect their
+ * charset and return the text converted to UTF-8 (Latin-1 text on failure). */
+class Latin1StringHandlerWithCharsetDetection : public ID3v2::Latin1StringHandler
+{
+public:
+    String parse(const ByteVector &data) const override
+    {
+        String str( data, String::Latin1 );
+        if ( IsLatin1( str.toCString( false ) ) )
+            return str;
+
+        /* Repeat short tags up to i_minlen characters for better estimation */
+        static constexpr unsigned int i_minlen = 16;
+        assert(str.length() != 0);
+        String test_str = str;
+        while ( test_str.length() < i_minlen )
+            test_str += str;
+        char *psz_charset = TryDetectCharset( test_str.toCString( false ) );
+        if ( psz_charset == NULL )
+            return str;
+
+        char *psz_utf8 = FromCharset( psz_charset, str.toCString( false ), str.length() );
+        free( psz_charset );
+        if ( psz_utf8 != NULL ) /* on conversion failure keep the Latin-1 text */
+            str = String( psz_utf8, String::UTF8 );
+        free( psz_utf8 );
+        return str;
+    }
+};
+
+namespace {
+    /* Handler instance that ReadMeta() registers with TagLib. */
+    const Latin1StringHandlerWithCharsetDetection string_handler{};
+}
+
 static int ExtractCoupleNumberValues( vlc_meta_t* p_meta, const char *psz_value,
         vlc_meta_type_t first, vlc_meta_type_t second)
 {
@@ -915,6 +953,8 @@ static int ReadMeta( vlc_object_t* p_this)
     else
         s.setMaxSequentialRead( 1024 * 2048 );
 #endif
+    ID3v2::Tag::setLatin1StringHandler(&string_handler);
+
     f = FileRef( &s, false, AudioProperties::ReadStyle::Fast );
 
     if( f.isNull() )
@@ -1350,3 +1390,26 @@ static int WriteMeta( vlc_object_t *p_this )
 
     return VLC_SUCCESS;
 }
+
+/* Guesses the charset of a null-terminated string with uchardet.
+ *
+ * Returns a strdup()ed charset name for the caller to free(), or NULL when
+ * the data is rejected, no charset is reported, or strdup() fails. */
+static char *TryDetectCharset( const char *str )
+{
+    char *psz_ret = NULL;
+    uchardet_t ud = uchardet_new();
+
+    if( uchardet_handle_data( ud, str, strlen(str) ) == 0 )
+    {
+        uchardet_data_end( ud );
+
+        /* Copy the name before the detector is deleted. */
+        const char *psz_name = uchardet_get_charset( ud );
+        if( psz_name != NULL && *psz_name != '\0' )
+            psz_ret = strdup( psz_name );
+    }
+
+    uchardet_delete( ud );
+    return psz_ret;
+}
-- 
2.25.1



More information about the vlc-devel mailing list