[vlc-devel] [PATCH 1/1] Add subtitle encoding detection with uchardet

pertuleha at gmail.com pertuleha at gmail.com
Thu Apr 4 21:01:48 CEST 2019


From: Aleksei Pertu <pertuleha at gmail.com>

Detect subtitle encoding, using uchardet library
Add UI setting for QT/macOS simple preferences
---
 configure.ac                                  | 10 +++
 contrib/src/uchardet/SHA512SUMS               |  1 +
 contrib/src/uchardet/rules.mak                | 25 ++++++
 extras/package/macosx/env.build.sh            |  8 +-
 modules/demux/Makefile.am                     |  2 +-
 modules/demux/subtitle.c                      | 83 ++++++++++++++++++-
 modules/gui/macosx/UI/SimplePreferences.xib   | 35 +++++---
 .../preferences/VLCSimplePrefsController.h    |  1 +
 .../preferences/VLCSimplePrefsController.m    |  2 +
 .../gui/qt/components/simple_preferences.cpp  |  1 +
 modules/gui/qt/ui/sprefs_subtitles.ui         |  7 ++
 11 files changed, 160 insertions(+), 15 deletions(-)
 create mode 100644 contrib/src/uchardet/SHA512SUMS
 create mode 100644 contrib/src/uchardet/rules.mak

diff --git a/configure.ac b/configure.ac
index adf61a1929..349a76c835 100644
--- a/configure.ac
+++ b/configure.ac
@@ -4002,6 +4002,16 @@ AS_IF([test "${enable_skins2}" = "yes" && test "${enable_libtar}" != "no"], [
   ])
 ])
 
+dnl
+dnl  uchardet library for character encoding detection
+dnl
+AC_CHECK_HEADER([uchardet/uchardet.h], [ have_uchardet=yes ], [ have_uchardet=no ])
+AM_CONDITIONAL([HAVE_UCHARDET], [ test "${have_uchardet}" = "yes" ])
+if test "${have_uchardet}" = "yes"
+then
+ VLC_ADD_LIBS([uchardet], [-luchardet -lstdc++])
+fi
+
 dnl
 dnl  MacOS X gui module
 dnl
diff --git a/contrib/src/uchardet/SHA512SUMS b/contrib/src/uchardet/SHA512SUMS
new file mode 100644
index 0000000000..5cbb1c2697
--- /dev/null
+++ b/contrib/src/uchardet/SHA512SUMS
@@ -0,0 +1 @@
+eceeadae060bf277e298d709856609dde32921271140dc1fb0a33c7b6e1381033fc2960d616ebbd82c92815936864d2c0743b1b5ea1b7d4a200df87df80d6de5  uchardet-0.0.6.tar.xz
diff --git a/contrib/src/uchardet/rules.mak b/contrib/src/uchardet/rules.mak
new file mode 100644
index 0000000000..5dd302be3a
--- /dev/null
+++ b/contrib/src/uchardet/rules.mak
@@ -0,0 +1,25 @@
+# uchardet: character encoding detection library (contrib build rules)
+
+UCHARDET_VERSION := 0.0.6
+UCHARDET_URL := https://www.freedesktop.org/software/uchardet/releases/uchardet-$(UCHARDET_VERSION).tar.xz
+# NOTE(review): need_pkg presumably expands to empty when uchardet >= 0.0.6 is already available - confirm
+PKGS += uchardet
+ifeq ($(call need_pkg,"uchardet >= 0.0.6"),)
+PKGS_FOUND += uchardet
+endif
+
+$(TARBALLS)/uchardet-$(UCHARDET_VERSION).tar.xz:
+	$(call download_pkg,$(UCHARDET_URL),uchardet)
+
+.sum-uchardet: uchardet-$(UCHARDET_VERSION).tar.xz
+
+uchardet: uchardet-$(UCHARDET_VERSION).tar.xz .sum-uchardet
+	$(UNPACK)
+	$(MOVE)
+# Run CMake inside the unpacked source directory with shared libraries disabled, then install.
+.uchardet: uchardet toolchain.cmake
+	cd $< && $(HOSTVARS_PIC) $(CMAKE) \
+		-DBUILD_SHARED_LIBS:BOOL=OFF \
+		.
+	cd $< && $(MAKE) install
+	touch $@
diff --git a/extras/package/macosx/env.build.sh b/extras/package/macosx/env.build.sh
index ee422eecb3..1c0ae62c5c 100755
--- a/extras/package/macosx/env.build.sh
+++ b/extras/package/macosx/env.build.sh
@@ -42,7 +42,13 @@ vlcSetBaseEnvironment() {
 
     python3Path=$(echo /Library/Frameworks/Python.framework/Versions/3.*/bin | awk '{print $1;}')
     if [ ! -d "$python3Path" ]; then
-        python3Path=""
+        # in case of HomeBrew or manually installed python3
+        python3ExePath=$(which python3)
+        if [ -n "$python3ExePath" ]; then
+                python3Path=$(dirname "$python3ExePath")
+        else
+                python3Path=""
+        fi;
     fi
 
     export PATH="${VLC_ROOT_DIR}/extras/tools/build/bin:${VLC_ROOT_DIR}/contrib/${LOCAL_TRIPLET}/bin:$python3Path:${VLC_PATH}:/bin:/sbin:/usr/bin:/usr/sbin"
diff --git a/modules/demux/Makefile.am b/modules/demux/Makefile.am
index 85d4b3fba9..54302049b0 100644
--- a/modules/demux/Makefile.am
+++ b/modules/demux/Makefile.am
@@ -61,7 +61,7 @@ libmjpeg_plugin_la_SOURCES = demux/mjpeg.c demux/mxpeg_helper.h
 demux_LTLIBRARIES += libmjpeg_plugin.la
 
 libsubtitle_plugin_la_SOURCES = demux/subtitle.c
-libsubtitle_plugin_la_LIBADD = $(LIBM)
+libsubtitle_plugin_la_LIBADD = $(LIBS_uchardet) $(LIBM)
 demux_LTLIBRARIES += libsubtitle_plugin.la
 
 libty_plugin_la_SOURCES = demux/ty.c codec/cc.h \
diff --git a/modules/demux/subtitle.c b/modules/demux/subtitle.c
index c715402da4..6a95aa8a43 100644
--- a/modules/demux/subtitle.c
+++ b/modules/demux/subtitle.c
@@ -41,6 +41,8 @@
 #include <vlc_demux.h>
 #include <vlc_charset.h>
 
+#include <uchardet/uchardet.h>
+
 /*****************************************************************************
  * Module descriptor
  *****************************************************************************/
@@ -51,6 +53,8 @@ static void Close( vlc_object_t *p_this );
     N_("Force the subtiles format. Selecting \"auto\" means autodetection and should always work.")
 #define SUB_DESCRIPTION_LONGTEXT \
     N_("Override the default track description.")
+#define SUB_DETECT_CHARSET_LONGTEXT \
+    N_("Try to auto-detect subtitles character encoding (with heuristic).")
 
 static const char *const ppsz_sub_type[] =
 {
@@ -71,6 +75,8 @@ vlc_module_begin ()
         change_string_list( ppsz_sub_type, ppsz_sub_type )
     add_string( "sub-description", NULL, N_("Subtitle description"),
                 SUB_DESCRIPTION_LONGTEXT, true )
+    add_bool( "sub-autodetect-charset", true, N_("Auto-detect subtitles encoding"),
+                SUB_DETECT_CHARSET_LONGTEXT, true );
     set_callbacks( Open, Close )
 
     add_shortcut( "subtitle" )
@@ -237,6 +243,8 @@ static int Control( demux_t *, int, va_list );
 static void Fix( demux_t * );
 static char * get_language_from_filename( const char * );
 
+static char * DetectCharset( text_t *txt );
+
 /*****************************************************************************
  * Decoder format output function
  *****************************************************************************/
@@ -661,8 +669,6 @@ static int Open ( vlc_object_t *p_this )
 
         p_sys->subtitles.i_count++;
     }
-    /* Unload */
-    TextUnload( &txtlines );
 
     msg_Dbg(p_demux, "loaded %zu subtitles", p_sys->subtitles.i_count );
 
@@ -682,6 +688,21 @@ static int Open ( vlc_object_t *p_this )
     else
         es_format_Init( &fmt, SPU_ES, VLC_CODEC_SUBT );
 
+    /* Try to detect subtitles charset */
+    bool b_charset_autodetection = var_InheritBool( p_demux, "sub-autodetect-charset" );
+    if ( b_charset_autodetection && NULL == fmt.subs.psz_encoding ) {
+        char *charset = DetectCharset( &txtlines );
+        if ( NULL != charset ) {
+            msg_Info( p_demux, "auto-detected charset: %s", charset );
+            fmt.subs.psz_encoding = charset;
+        } else {
+            msg_Info( p_demux, "charset auto-detection failed" );
+        }
+    }
+
+    /* Unload subtitles text */
+    TextUnload( &txtlines );
+
     p_sys->subtitles.i_current = 0;
     p_sys->i_length = 0;
     if( p_sys->subtitles.i_count > 0 )
@@ -944,6 +965,7 @@ static int TextLoad( text_t *txt, stream_t *s )
 
     return VLC_SUCCESS;
 }
+
 static void TextUnload( text_t *txt )
 {
     if( txt->i_line_count )
@@ -963,12 +985,17 @@ static char *TextGetLine( text_t *txt )
 
     return txt->line[txt->i_line++];
 }
+
 static void TextPreviousLine( text_t *txt )
 {
     if( txt->i_line > 0 )
         txt->i_line--;
 }
 
+/* Rewind the read cursor of txt: the next TextGetLine() call starts over
+ * from txt->line[0]. */
+static void TextResetLine( text_t *txt ) { txt->i_line = 0; }
+
 /*****************************************************************************
  * Specific Subtitle function
  *****************************************************************************/
@@ -2441,3 +2468,55 @@ static char * get_language_from_filename( const char * psz_sub_file )
     free( psz_work );
     return psz_ret;
 }
+
+/* Join all lines of txt into one malloc()ed, NUL-terminated string that the
+ * caller must free(). Returns NULL if memory runs out. */
+static char * MergeTxtLines( text_t *txt ) {
+    size_t i_merged_len = 0;
+    char *psz_merged = malloc( 1 );
+    if ( NULL == psz_merged )
+        return NULL;
+    psz_merged[0] = '\0';
+    TextResetLine( txt );
+    for ( char *psz_line = TextGetLine( txt ); NULL != psz_line;
+          psz_line = TextGetLine( txt ) ) {
+        size_t i_line_len = strlen( psz_line );
+        /* grow via a temporary: on failure the old buffer is still ours */
+        char *psz_grown = realloc( psz_merged, i_merged_len + i_line_len + 1 );
+        if ( NULL == psz_grown ) {
+            free( psz_merged );
+            return NULL;
+        }
+        psz_merged = psz_grown;
+        /* copy at the known end, NUL included, so the merge stays O(N) */
+        memcpy( psz_merged + i_merged_len, psz_line, i_line_len + 1 );
+        i_merged_len += i_line_len;
+    }
+    TextResetLine( txt ); /* leave the cursor on the first line again */
+    return psz_merged;
+}
+
+/* Guess the charset of txt with uchardet. Returns a strdup()ed charset name
+ * the caller must free(), or NULL when detection fails, memory runs out, or
+ * the text is plain ASCII. */
+static char * DetectCharset( text_t *txt ) {
+    /* merged: uchardet's full-text result is better than line-by-line */
+    char *psz_text = MergeTxtLines( txt );
+    if ( NULL == psz_text )
+        return NULL; /* nothing to analyse; strlen( NULL ) would crash */
+    char *psz_detected_charset = NULL;
+    uchardet_t ud = uchardet_new();
+    /* uchardet_handle_data() returns non-zero on failure */
+    if ( NULL != ud
+         && 0 == uchardet_handle_data( ud, psz_text, strlen( psz_text ) ) ) {
+        uchardet_data_end( ud );
+        const char *psz_name = uchardet_get_charset( ud );
+        /* uchardet's result will be freed on uchardet_delete() => strdup */
+        if ( '\0' != psz_name[0] && 0 != strcmp( psz_name, "ASCII" ) )
+            psz_detected_charset = strdup( psz_name );
+    }
+    if ( NULL != ud )
+        uchardet_delete( ud );
+    free( psz_text ); /* the merged copy is owned by this function */
+    return psz_detected_charset;
+}
diff --git a/modules/gui/macosx/UI/SimplePreferences.xib b/modules/gui/macosx/UI/SimplePreferences.xib
index 1366b78cc7..d8f4d94c00 100644
--- a/modules/gui/macosx/UI/SimplePreferences.xib
+++ b/modules/gui/macosx/UI/SimplePreferences.xib
@@ -84,6 +84,7 @@
                 <outlet property="osdView" destination="2523" id="d1o-FZ-hXa"/>
                 <outlet property="osd_encodingLabel" destination="2531" id="yyW-qD-zJ0"/>
                 <outlet property="osd_encodingPopup" destination="2532" id="mBO-m6-lIV"/>
+                <outlet property="osd_encodingAutoDetectCheckbox" destination="epz-hm-jAM" id="mAn-id-Mmz"/>
                 <outlet property="osd_fontBox" destination="2537" id="OCf-KO-i4B"/>
                 <outlet property="osd_fontButton" destination="2543" id="UOT-87-z16"/>
                 <outlet property="osd_fontLabel" destination="2542" id="LD1-g4-6Ex"/>
@@ -1532,7 +1533,7 @@ Gw
             <point key="canvasLocation" x="883" y="158"/>
         </customView>
         <customView translatesAutoresizingMaskIntoConstraints="NO" id="2523" userLabel="Subtitles & OSD Settings">
-            <rect key="frame" x="0.0" y="-1" width="571" height="402"/>
+            <rect key="frame" x="0.0" y="-1" width="571" height="428"/>
             <subviews>
                 <box title="Display Settings" translatesAutoresizingMaskIntoConstraints="NO" id="2537">
                     <rect key="frame" x="17" y="16" width="537" height="214"/>
@@ -1770,13 +1771,13 @@ Gw
                     </view>
                 </box>
                 <box title="On Screen Display" translatesAutoresizingMaskIntoConstraints="NO" id="2524">
-                    <rect key="frame" x="17" y="328" width="537" height="54"/>
+                    <rect key="frame" x="17" y="352" width="537" height="56"/>
                     <view key="contentView" id="GNq-vl-Idl">
-                        <rect key="frame" x="3" y="3" width="531" height="36"/>
+                        <rect key="frame" x="3" y="3" width="531" height="38"/>
                         <autoresizingMask key="autoresizingMask" widthSizable="YES" heightSizable="YES"/>
                         <subviews>
                             <button mirrorLayoutDirectionWhenInternationalizing="always" translatesAutoresizingMaskIntoConstraints="NO" id="2526">
-                                <rect key="frame" x="14" y="9" width="94" height="18"/>
+                                <rect key="frame" x="14" y="9" width="94" height="20"/>
                                 <buttonCell key="cell" type="check" title="Enable OSD" bezelStyle="regularSquare" imagePosition="left" alignment="left" inset="2" id="3494">
                                     <behavior key="behavior" changeContents="YES" doesNotDimImage="YES" lightByContents="YES"/>
                                     <font key="font" metaFont="system"/>
@@ -1795,13 +1796,13 @@ Gw
                     </view>
                 </box>
                 <box title="SPU language" translatesAutoresizingMaskIntoConstraints="NO" id="1aw-Yd-yzY">
-                    <rect key="frame" x="17" y="235" width="537" height="88"/>
+                    <rect key="frame" x="17" y="235" width="537" height="112"/>
                     <view key="contentView" id="clp-fm-5xs">
-                        <rect key="frame" x="3" y="3" width="531" height="70"/>
+                        <rect key="frame" x="3" y="3" width="531" height="94"/>
                         <autoresizingMask key="autoresizingMask" widthSizable="YES" heightSizable="YES"/>
                         <subviews>
                             <textField horizontalHuggingPriority="251" verticalHuggingPriority="750" translatesAutoresizingMaskIntoConstraints="NO" id="2529">
-                                <rect key="frame" x="16" y="43" width="175" height="17"/>
+                                <rect key="frame" x="16" y="67" width="175" height="17"/>
                                 <textFieldCell key="cell" lineBreakMode="truncatingTail" sendsActionOnEndEditing="YES" title="Preferred Subtitle Language" usesSingleLineMode="YES" id="3495">
                                     <font key="font" metaFont="system"/>
                                     <color key="textColor" name="controlTextColor" catalog="System" colorSpace="catalog"/>
@@ -1809,7 +1810,7 @@ Gw
                                 </textFieldCell>
                             </textField>
                             <textField verticalHuggingPriority="750" translatesAutoresizingMaskIntoConstraints="NO" id="2530">
-                                <rect key="frame" x="206" y="41" width="308" height="22"/>
+                                <rect key="frame" x="206" y="65" width="308" height="22"/>
                                 <textFieldCell key="cell" scrollable="YES" lineBreakMode="clipping" selectable="YES" editable="YES" continuous="YES" sendsActionOnEndEditing="YES" state="on" borderStyle="bezel" drawsBackground="YES" id="3496">
                                     <font key="font" metaFont="system"/>
                                     <color key="textColor" name="controlTextColor" catalog="System" colorSpace="catalog"/>
@@ -1821,7 +1822,7 @@ Gw
                                 </connections>
                             </textField>
                             <textField horizontalHuggingPriority="251" verticalHuggingPriority="750" translatesAutoresizingMaskIntoConstraints="NO" id="2531">
-                                <rect key="frame" x="16" y="14" width="175" height="17"/>
+                                <rect key="frame" x="16" y="38" width="175" height="17"/>
                                 <textFieldCell key="cell" lineBreakMode="truncatingTail" sendsActionOnEndEditing="YES" title="Default Encoding" usesSingleLineMode="YES" id="3497">
                                     <font key="font" metaFont="system"/>
                                     <color key="textColor" name="controlTextColor" catalog="System" colorSpace="catalog"/>
@@ -1829,7 +1830,7 @@ Gw
                                 </textFieldCell>
                             </textField>
                             <popUpButton verticalHuggingPriority="750" translatesAutoresizingMaskIntoConstraints="NO" id="2532">
-                                <rect key="frame" x="204" y="9" width="313" height="25"/>
+                                <rect key="frame" x="204" y="33" width="313" height="25"/>
                                 <popUpButtonCell key="cell" type="push" bezelStyle="rounded" lineBreakMode="truncatingTail" state="on" borderStyle="borderAndBezel" inset="2" arrowPosition="arrowAtCenter" preferredEdge="maxY" selectedItem="2665" id="3498">
                                     <behavior key="behavior" lightByBackground="YES" lightByGray="YES"/>
                                     <font key="font" metaFont="menu"/>
@@ -1848,15 +1849,27 @@ Gw
                                     <action selector="osdSettingChanged:" target="-2" id="xeg-0v-Ab2"/>
                                 </connections>
                             </popUpButton>
+                            <button verticalHuggingPriority="750" translatesAutoresizingMaskIntoConstraints="NO" id="epz-hm-jAM">
+                                <rect key="frame" x="16" y="10" width="193" height="18"/>
+                                <buttonCell key="cell" type="check" title="Try to auto-detect encoding" bezelStyle="regularSquare" imagePosition="left" state="on" inset="2" id="7Mx-so-6d6">
+                                    <behavior key="behavior" changeContents="YES" doesNotDimImage="YES" lightByContents="YES"/>
+                                    <font key="font" metaFont="system"/>
+                                </buttonCell>
+                                <connections>
+                                    <action selector="osdSettingChanged:" target="-2" id="mAn-id-Yn1"/>
+                                </connections>
+                            </button>
                         </subviews>
                         <constraints>
                             <constraint firstItem="2529" firstAttribute="centerY" secondItem="2530" secondAttribute="centerY" id="HsK-i9-n2z"/>
-                            <constraint firstAttribute="bottom" secondItem="2532" secondAttribute="bottom" constant="12" id="LX5-YD-7vT"/>
+                            <constraint firstAttribute="bottom" secondItem="epz-hm-jAM" secondAttribute="bottom" constant="12" id="LX5-YD-7vT"/>
                             <constraint firstAttribute="trailing" secondItem="2530" secondAttribute="trailing" constant="17" id="NTc-Wd-fKW"/>
                             <constraint firstItem="2529" firstAttribute="leading" secondItem="clp-fm-5xs" secondAttribute="leading" constant="18" id="Q4r-4X-sJQ"/>
                             <constraint firstItem="2530" firstAttribute="leading" secondItem="2532" secondAttribute="leading" id="R1y-rp-flP"/>
                             <constraint firstItem="2532" firstAttribute="leading" secondItem="2531" secondAttribute="trailing" constant="17" id="Upg-N7-CHN"/>
                             <constraint firstItem="2530" firstAttribute="width" secondItem="2532" secondAttribute="width" id="VfS-JB-gci"/>
+                            <constraint firstItem="epz-hm-jAM" firstAttribute="top" secondItem="2531" secondAttribute="bottom" constant="12" id="aws-1g-8oU"/>
+                            <constraint firstItem="epz-hm-jAM" firstAttribute="leading" secondItem="2531" secondAttribute="leading" id="dZ3-mh-SN1"/>
                             <constraint firstItem="2531" firstAttribute="centerY" secondItem="2532" secondAttribute="centerY" id="lPF-fI-ol6"/>
                             <constraint firstItem="2529" firstAttribute="leading" secondItem="2531" secondAttribute="leading" id="vCh-Cd-x9H"/>
                             <constraint firstItem="2529" firstAttribute="top" secondItem="clp-fm-5xs" secondAttribute="top" constant="10" id="vT7-e0-BpD"/>
diff --git a/modules/gui/macosx/preferences/VLCSimplePrefsController.h b/modules/gui/macosx/preferences/VLCSimplePrefsController.h
index b7d0b49141..1dc0e73fb0 100644
--- a/modules/gui/macosx/preferences/VLCSimplePrefsController.h
+++ b/modules/gui/macosx/preferences/VLCSimplePrefsController.h
@@ -157,6 +157,7 @@
 @property (readwrite, weak) IBOutlet NSButton *osd_forceboldCheckbox;
 @property (readwrite, weak) IBOutlet NSBox *osd_osdBox;
 @property (readwrite, weak) IBOutlet NSButton *osd_osdCheckbox;
+@property (readwrite, weak) IBOutlet NSButton *osd_encodingAutoDetectCheckbox;
 
 // video pane
 @property (readwrite, strong) IBOutlet NSView *videoView;
diff --git a/modules/gui/macosx/preferences/VLCSimplePrefsController.m b/modules/gui/macosx/preferences/VLCSimplePrefsController.m
index 26d9589022..24295f1854 100644
--- a/modules/gui/macosx/preferences/VLCSimplePrefsController.m
+++ b/modules/gui/macosx/preferences/VLCSimplePrefsController.m
@@ -714,6 +714,7 @@ static inline const char * __config_GetLabel(vlc_object_t *p_this, const char *p
 
     [self setupButton:_osd_encodingPopup forStringList: "subsdec-encoding"];
     [self setupField:_osd_langTextField forOption: "sub-language" ];
+    [self setupButton:_osd_encodingAutoDetectCheckbox forBoolValue: "sub-autodetect-charset"];
 
     [self setupField:_osd_fontTextField forOption: "freetype-font"];
     [self setupButton:_osd_font_colorPopup forIntList: "freetype-color"];
@@ -1009,6 +1010,7 @@ static inline void save_string_list(intf_thread_t * p_intf, id object, const cha
             config_PutPsz("subsdec-encoding", "");
 
         config_PutPsz("sub-language", [[_osd_langTextField stringValue] UTF8String]);
+        config_PutInt("sub-autodetect-charset", [_osd_encodingAutoDetectCheckbox state]);
 
         config_PutPsz("freetype-font", [[_osd_fontTextField stringValue] UTF8String]);
         SaveIntList(_osd_font_colorPopup, "freetype-color");
diff --git a/modules/gui/qt/components/simple_preferences.cpp b/modules/gui/qt/components/simple_preferences.cpp
index d79fdf09f8..a35d2c7490 100644
--- a/modules/gui/qt/components/simple_preferences.cpp
+++ b/modules/gui/qt/components/simple_preferences.cpp
@@ -851,6 +851,7 @@ SPrefsPanel::SPrefsPanel( intf_thread_t *_p_intf, QWidget *_parent,
                             encoding );
             CONFIG_GENERIC( "sub-language", String, ui.subLangLabel,
                             preferredLanguage );
+            CONFIG_BOOL( "sub-autodetect-charset", subDetectCharset );
 
             CONFIG_GENERIC( "freetype-rel-fontsize", IntegerList,
                             ui.fontSizeLabel, fontSize );
diff --git a/modules/gui/qt/ui/sprefs_subtitles.ui b/modules/gui/qt/ui/sprefs_subtitles.ui
index b9e851dc9d..12ab2d56d5 100644
--- a/modules/gui/qt/ui/sprefs_subtitles.ui
+++ b/modules/gui/qt/ui/sprefs_subtitles.ui
@@ -136,6 +136,13 @@
            </property>
           </widget>
          </item>
+         <item row="2" column="0">
+           <widget class="QCheckBox" name="subDetectCharset">
+             <property name="text">
+               <string>Try to auto-detect encoding</string>
+             </property>
+           </widget>
+         </item>
         </layout>
        </widget>
       </item>
-- 
2.20.1



More information about the vlc-devel mailing list