[vlc-devel] [PATCH 1/1] Add subtitle encoding detection with uchardet
pertuleha at gmail.com
pertuleha at gmail.com
Thu Apr 4 21:01:48 CEST 2019
From: Aleksei Pertu <pertuleha at gmail.com>
Detect the subtitle character encoding using the uchardet library.
Add a UI setting for it to the Qt and macOS simple preferences.
---
configure.ac | 10 +++
contrib/src/uchardet/SHA512SUMS | 1 +
contrib/src/uchardet/rules.mak | 25 ++++++
extras/package/macosx/env.build.sh | 8 +-
modules/demux/Makefile.am | 2 +-
modules/demux/subtitle.c | 83 ++++++++++++++++++-
modules/gui/macosx/UI/SimplePreferences.xib | 35 +++++---
.../preferences/VLCSimplePrefsController.h | 1 +
.../preferences/VLCSimplePrefsController.m | 2 +
.../gui/qt/components/simple_preferences.cpp | 1 +
modules/gui/qt/ui/sprefs_subtitles.ui | 7 ++
11 files changed, 160 insertions(+), 15 deletions(-)
create mode 100644 contrib/src/uchardet/SHA512SUMS
create mode 100644 contrib/src/uchardet/rules.mak
diff --git a/configure.ac b/configure.ac
index adf61a1929..349a76c835 100644
--- a/configure.ac
+++ b/configure.ac
@@ -4002,6 +4002,16 @@ AS_IF([test "${enable_skins2}" = "yes" && test "${enable_libtar}" != "no"], [
])
])
+dnl
+dnl uchardet library for character encoding detection
+dnl
+AC_CHECK_HEADER([uchardet/uchardet.h], [ have_uchardet=yes ], [ have_uchardet=no ])
+AM_CONDITIONAL([HAVE_UCHARDET], [ test "${have_uchardet}" = "yes" ])
+if test "${have_uchardet}" = "yes"
+then
+ VLC_ADD_LIBS([uchardet], [-luchardet -lstdc++])
+fi
+
dnl
dnl MacOS X gui module
dnl
diff --git a/contrib/src/uchardet/SHA512SUMS b/contrib/src/uchardet/SHA512SUMS
new file mode 100644
index 0000000000..5cbb1c2697
--- /dev/null
+++ b/contrib/src/uchardet/SHA512SUMS
@@ -0,0 +1 @@
+eceeadae060bf277e298d709856609dde32921271140dc1fb0a33c7b6e1381033fc2960d616ebbd82c92815936864d2c0743b1b5ea1b7d4a200df87df80d6de5 uchardet-0.0.6.tar.xz
diff --git a/contrib/src/uchardet/rules.mak b/contrib/src/uchardet/rules.mak
new file mode 100644
index 0000000000..5dd302be3a
--- /dev/null
+++ b/contrib/src/uchardet/rules.mak
@@ -0,0 +1,25 @@
+# uchardet: character encoding detection library, built static with CMake
+
+UCHARDET_VERSION := 0.0.6
+UCHARDET_URL := https://www.freedesktop.org/software/uchardet/releases/uchardet-$(UCHARDET_VERSION).tar.xz
+
+PKGS += uchardet
+ifeq ($(call need_pkg,"uchardet >= 0.0.6"),)
+PKGS_FOUND += uchardet
+endif
+
+$(TARBALLS)/uchardet-$(UCHARDET_VERSION).tar.xz:
+ $(call download_pkg,$(UCHARDET_URL),uchardet)
+
+.sum-uchardet: uchardet-$(UCHARDET_VERSION).tar.xz
+
+uchardet: uchardet-$(UCHARDET_VERSION).tar.xz .sum-uchardet
+ $(UNPACK)
+ $(MOVE)
+
+.uchardet: uchardet toolchain.cmake
+ cd $< && $(HOSTVARS_PIC) $(CMAKE) \
+ -DBUILD_SHARED_LIBS:BOOL=OFF \
+ .
+ cd $< && $(MAKE) install
+ touch $@
diff --git a/extras/package/macosx/env.build.sh b/extras/package/macosx/env.build.sh
index ee422eecb3..1c0ae62c5c 100755
--- a/extras/package/macosx/env.build.sh
+++ b/extras/package/macosx/env.build.sh
@@ -42,7 +42,13 @@ vlcSetBaseEnvironment() {
python3Path=$(echo /Library/Frameworks/Python.framework/Versions/3.*/bin | awk '{print $1;}')
if [ ! -d "$python3Path" ]; then
- python3Path=""
+ # in case of HomeBrew or manually installed python3
+ python3ExePath=$(which python3)
+ if [ -n "$python3ExePath" ]; then
+ python3Path=$(dirname "$python3ExePath")
+ else
+ python3Path=""
+ fi;
fi
export PATH="${VLC_ROOT_DIR}/extras/tools/build/bin:${VLC_ROOT_DIR}/contrib/${LOCAL_TRIPLET}/bin:$python3Path:${VLC_PATH}:/bin:/sbin:/usr/bin:/usr/sbin"
diff --git a/modules/demux/Makefile.am b/modules/demux/Makefile.am
index 85d4b3fba9..54302049b0 100644
--- a/modules/demux/Makefile.am
+++ b/modules/demux/Makefile.am
@@ -61,7 +61,7 @@ libmjpeg_plugin_la_SOURCES = demux/mjpeg.c demux/mxpeg_helper.h
demux_LTLIBRARIES += libmjpeg_plugin.la
libsubtitle_plugin_la_SOURCES = demux/subtitle.c
-libsubtitle_plugin_la_LIBADD = $(LIBM)
+libsubtitle_plugin_la_LIBADD = $(LIBS_uchardet) $(LIBM)
demux_LTLIBRARIES += libsubtitle_plugin.la
libty_plugin_la_SOURCES = demux/ty.c codec/cc.h \
diff --git a/modules/demux/subtitle.c b/modules/demux/subtitle.c
index c715402da4..6a95aa8a43 100644
--- a/modules/demux/subtitle.c
+++ b/modules/demux/subtitle.c
@@ -41,6 +41,8 @@
#include <vlc_demux.h>
#include <vlc_charset.h>
+#include <uchardet/uchardet.h>
+
/*****************************************************************************
* Module descriptor
*****************************************************************************/
@@ -51,6 +53,8 @@ static void Close( vlc_object_t *p_this );
N_("Force the subtiles format. Selecting \"auto\" means autodetection and should always work.")
#define SUB_DESCRIPTION_LONGTEXT \
N_("Override the default track description.")
+#define SUB_DETECT_CHARSET_LONGTEXT \
+ N_("Try to auto-detect subtitles character encoding (with heuristic).")
static const char *const ppsz_sub_type[] =
{
@@ -71,6 +75,8 @@ vlc_module_begin ()
change_string_list( ppsz_sub_type, ppsz_sub_type )
add_string( "sub-description", NULL, N_("Subtitle description"),
SUB_DESCRIPTION_LONGTEXT, true )
+ add_bool( "sub-autodetect-charset", true, N_("Auto-detect subtitles encoding"),
+ SUB_DETECT_CHARSET_LONGTEXT, true );
set_callbacks( Open, Close )
add_shortcut( "subtitle" )
@@ -237,6 +243,8 @@ static int Control( demux_t *, int, va_list );
static void Fix( demux_t * );
static char * get_language_from_filename( const char * );
+static char * DetectCharset( text_t *txt );
+
/*****************************************************************************
* Decoder format output function
*****************************************************************************/
@@ -661,8 +669,6 @@ static int Open ( vlc_object_t *p_this )
p_sys->subtitles.i_count++;
}
- /* Unload */
- TextUnload( &txtlines );
msg_Dbg(p_demux, "loaded %zu subtitles", p_sys->subtitles.i_count );
@@ -682,6 +688,21 @@ static int Open ( vlc_object_t *p_this )
else
es_format_Init( &fmt, SPU_ES, VLC_CODEC_SUBT );
+ /* Try to detect subtitles charset */
+ bool b_charset_autodetection = var_InheritBool( p_demux, "sub-autodetect-charset" );
+ if ( b_charset_autodetection && NULL == fmt.subs.psz_encoding ) {
+ char *charset = DetectCharset( &txtlines );
+ if ( NULL != charset ) {
+ msg_Info( p_demux, "auto-detected charset: %s", charset );
+ fmt.subs.psz_encoding = charset;
+ } else {
+ msg_Info( p_demux, "charset auto-detection failed" );
+ }
+ }
+
+ /* Unload subtitles text */
+ TextUnload( &txtlines );
+
p_sys->subtitles.i_current = 0;
p_sys->i_length = 0;
if( p_sys->subtitles.i_count > 0 )
@@ -944,6 +965,7 @@ static int TextLoad( text_t *txt, stream_t *s )
return VLC_SUCCESS;
}
+
static void TextUnload( text_t *txt )
{
if( txt->i_line_count )
@@ -963,12 +985,17 @@ static char *TextGetLine( text_t *txt )
return txt->line[txt->i_line++];
}
+
static void TextPreviousLine( text_t *txt )
{
if( txt->i_line > 0 )
txt->i_line--;
}
+static void TextResetLine( text_t *txt ) { /* rewind the line cursor of txt */
+ txt->i_line = 0; /* TextGetLine() reads txt->line[i_line], so 0 restarts at the first line */
+}
+
/*****************************************************************************
* Specific Subtitle function
*****************************************************************************/
@@ -2441,3 +2468,55 @@ static char * get_language_from_filename( const char * psz_sub_file )
free( psz_work );
return psz_ret;
}
+
+/* Join every line of txt (no separator) into one heap string the caller must
+ * free(); rewinds the line cursor before and after. NULL if out of memory. */
+static char * MergeTxtLines( text_t *txt ) {
+    size_t i_merged_len = 0;
+    char *psz_merged = calloc( 1, 1 ); /* starts out as the empty string */
+    if ( NULL == psz_merged )
+        return NULL;
+
+    TextResetLine( txt );
+    for ( char *psz_line; NULL != ( psz_line = TextGetLine( txt ) ); ) {
+        size_t i_line_len = strlen( psz_line );
+        /* grow via a temporary so the old buffer is not leaked on failure */
+        char *psz_grown = realloc( psz_merged, i_merged_len + i_line_len + 1 );
+        if ( NULL == psz_grown ) {
+            free( psz_merged );
+            psz_merged = NULL;
+            break;
+        }
+        psz_merged = psz_grown;
+        /* append at the known end (keeps the merge O(N)); copies the NUL */
+        memcpy( psz_merged + i_merged_len, psz_line, i_line_len + 1 );
+        i_merged_len += i_line_len;
+    }
+    TextResetLine( txt );
+    return psz_merged;
+}
+
+/* Guess the character encoding of the loaded subtitle text with uchardet.
+ * Returns a strdup()ed charset name that the caller owns, or NULL when the
+ * text is plain ASCII, nothing was detected, or an allocation failed. */
+static char * DetectCharset( text_t *txt ) {
+    char *psz_detected_charset = NULL;
+    uchardet_t ud = uchardet_new();
+    /* lines are merged: uchardet's full-text result beats line-by-line */
+    char *psz_text = MergeTxtLines( txt );
+
+    if ( NULL != ud && NULL != psz_text
+      && 0 == uchardet_handle_data( ud, psz_text, strlen( psz_text ) ) ) {
+        uchardet_data_end( ud );
+        /* the name is owned by ud and dies with uchardet_delete() => strdup */
+        const char *psz_name = uchardet_get_charset( ud );
+        if ( NULL != psz_name && '\0' != psz_name[0]
+          && 0 != strcmp( psz_name, "ASCII" ) )
+            psz_detected_charset = strdup( psz_name );
+    }
+
+    if ( NULL != ud )
+        uchardet_delete( ud );
+    free( psz_text ); /* the merged copy was only needed for detection */
+    return psz_detected_charset;
+}
diff --git a/modules/gui/macosx/UI/SimplePreferences.xib b/modules/gui/macosx/UI/SimplePreferences.xib
index 1366b78cc7..d8f4d94c00 100644
--- a/modules/gui/macosx/UI/SimplePreferences.xib
+++ b/modules/gui/macosx/UI/SimplePreferences.xib
@@ -84,6 +84,7 @@
<outlet property="osdView" destination="2523" id="d1o-FZ-hXa"/>
<outlet property="osd_encodingLabel" destination="2531" id="yyW-qD-zJ0"/>
<outlet property="osd_encodingPopup" destination="2532" id="mBO-m6-lIV"/>
+ <outlet property="osd_encodingAutoDetectCheckbox" destination="epz-hm-jAM" id="mAn-id-Mmz"/>
<outlet property="osd_fontBox" destination="2537" id="OCf-KO-i4B"/>
<outlet property="osd_fontButton" destination="2543" id="UOT-87-z16"/>
<outlet property="osd_fontLabel" destination="2542" id="LD1-g4-6Ex"/>
@@ -1532,7 +1533,7 @@ Gw
<point key="canvasLocation" x="883" y="158"/>
</customView>
<customView translatesAutoresizingMaskIntoConstraints="NO" id="2523" userLabel="Subtitles &amp; OSD Settings">
- <rect key="frame" x="0.0" y="-1" width="571" height="402"/>
+ <rect key="frame" x="0.0" y="-1" width="571" height="428"/>
<subviews>
<box title="Display Settings" translatesAutoresizingMaskIntoConstraints="NO" id="2537">
<rect key="frame" x="17" y="16" width="537" height="214"/>
@@ -1770,13 +1771,13 @@ Gw
</view>
</box>
<box title="On Screen Display" translatesAutoresizingMaskIntoConstraints="NO" id="2524">
- <rect key="frame" x="17" y="328" width="537" height="54"/>
+ <rect key="frame" x="17" y="352" width="537" height="56"/>
<view key="contentView" id="GNq-vl-Idl">
- <rect key="frame" x="3" y="3" width="531" height="36"/>
+ <rect key="frame" x="3" y="3" width="531" height="38"/>
<autoresizingMask key="autoresizingMask" widthSizable="YES" heightSizable="YES"/>
<subviews>
<button mirrorLayoutDirectionWhenInternationalizing="always" translatesAutoresizingMaskIntoConstraints="NO" id="2526">
- <rect key="frame" x="14" y="9" width="94" height="18"/>
+ <rect key="frame" x="14" y="9" width="94" height="20"/>
<buttonCell key="cell" type="check" title="Enable OSD" bezelStyle="regularSquare" imagePosition="left" alignment="left" inset="2" id="3494">
<behavior key="behavior" changeContents="YES" doesNotDimImage="YES" lightByContents="YES"/>
<font key="font" metaFont="system"/>
@@ -1795,13 +1796,13 @@ Gw
</view>
</box>
<box title="SPU language" translatesAutoresizingMaskIntoConstraints="NO" id="1aw-Yd-yzY">
- <rect key="frame" x="17" y="235" width="537" height="88"/>
+ <rect key="frame" x="17" y="235" width="537" height="112"/>
<view key="contentView" id="clp-fm-5xs">
- <rect key="frame" x="3" y="3" width="531" height="70"/>
+ <rect key="frame" x="3" y="3" width="531" height="94"/>
<autoresizingMask key="autoresizingMask" widthSizable="YES" heightSizable="YES"/>
<subviews>
<textField horizontalHuggingPriority="251" verticalHuggingPriority="750" translatesAutoresizingMaskIntoConstraints="NO" id="2529">
- <rect key="frame" x="16" y="43" width="175" height="17"/>
+ <rect key="frame" x="16" y="67" width="175" height="17"/>
<textFieldCell key="cell" lineBreakMode="truncatingTail" sendsActionOnEndEditing="YES" title="Preferred Subtitle Language" usesSingleLineMode="YES" id="3495">
<font key="font" metaFont="system"/>
<color key="textColor" name="controlTextColor" catalog="System" colorSpace="catalog"/>
@@ -1809,7 +1810,7 @@ Gw
</textFieldCell>
</textField>
<textField verticalHuggingPriority="750" translatesAutoresizingMaskIntoConstraints="NO" id="2530">
- <rect key="frame" x="206" y="41" width="308" height="22"/>
+ <rect key="frame" x="206" y="65" width="308" height="22"/>
<textFieldCell key="cell" scrollable="YES" lineBreakMode="clipping" selectable="YES" editable="YES" continuous="YES" sendsActionOnEndEditing="YES" state="on" borderStyle="bezel" drawsBackground="YES" id="3496">
<font key="font" metaFont="system"/>
<color key="textColor" name="controlTextColor" catalog="System" colorSpace="catalog"/>
@@ -1821,7 +1822,7 @@ Gw
</connections>
</textField>
<textField horizontalHuggingPriority="251" verticalHuggingPriority="750" translatesAutoresizingMaskIntoConstraints="NO" id="2531">
- <rect key="frame" x="16" y="14" width="175" height="17"/>
+ <rect key="frame" x="16" y="38" width="175" height="17"/>
<textFieldCell key="cell" lineBreakMode="truncatingTail" sendsActionOnEndEditing="YES" title="Default Encoding" usesSingleLineMode="YES" id="3497">
<font key="font" metaFont="system"/>
<color key="textColor" name="controlTextColor" catalog="System" colorSpace="catalog"/>
@@ -1829,7 +1830,7 @@ Gw
</textFieldCell>
</textField>
<popUpButton verticalHuggingPriority="750" translatesAutoresizingMaskIntoConstraints="NO" id="2532">
- <rect key="frame" x="204" y="9" width="313" height="25"/>
+ <rect key="frame" x="204" y="33" width="313" height="25"/>
<popUpButtonCell key="cell" type="push" bezelStyle="rounded" lineBreakMode="truncatingTail" state="on" borderStyle="borderAndBezel" inset="2" arrowPosition="arrowAtCenter" preferredEdge="maxY" selectedItem="2665" id="3498">
<behavior key="behavior" lightByBackground="YES" lightByGray="YES"/>
<font key="font" metaFont="menu"/>
@@ -1848,15 +1849,27 @@ Gw
<action selector="osdSettingChanged:" target="-2" id="xeg-0v-Ab2"/>
</connections>
</popUpButton>
+ <button verticalHuggingPriority="750" translatesAutoresizingMaskIntoConstraints="NO" id="epz-hm-jAM">
+ <rect key="frame" x="16" y="10" width="193" height="18"/>
+ <buttonCell key="cell" type="check" title="Try to auto-detect encoding" bezelStyle="regularSquare" imagePosition="left" state="on" inset="2" id="7Mx-so-6d6">
+ <behavior key="behavior" changeContents="YES" doesNotDimImage="YES" lightByContents="YES"/>
+ <font key="font" metaFont="system"/>
+ </buttonCell>
+ <connections>
+ <action selector="osdSettingChanged:" target="-2" id="mAn-id-Yn1"/>
+ </connections>
+ </button>
</subviews>
<constraints>
<constraint firstItem="2529" firstAttribute="centerY" secondItem="2530" secondAttribute="centerY" id="HsK-i9-n2z"/>
- <constraint firstAttribute="bottom" secondItem="2532" secondAttribute="bottom" constant="12" id="LX5-YD-7vT"/>
+ <constraint firstAttribute="bottom" secondItem="epz-hm-jAM" secondAttribute="bottom" constant="12" id="LX5-YD-7vT"/>
<constraint firstAttribute="trailing" secondItem="2530" secondAttribute="trailing" constant="17" id="NTc-Wd-fKW"/>
<constraint firstItem="2529" firstAttribute="leading" secondItem="clp-fm-5xs" secondAttribute="leading" constant="18" id="Q4r-4X-sJQ"/>
<constraint firstItem="2530" firstAttribute="leading" secondItem="2532" secondAttribute="leading" id="R1y-rp-flP"/>
<constraint firstItem="2532" firstAttribute="leading" secondItem="2531" secondAttribute="trailing" constant="17" id="Upg-N7-CHN"/>
<constraint firstItem="2530" firstAttribute="width" secondItem="2532" secondAttribute="width" id="VfS-JB-gci"/>
+ <constraint firstItem="epz-hm-jAM" firstAttribute="top" secondItem="2531" secondAttribute="bottom" constant="12" id="aws-1g-8oU"/>
+ <constraint firstItem="epz-hm-jAM" firstAttribute="leading" secondItem="2531" secondAttribute="leading" id="dZ3-mh-SN1"/>
<constraint firstItem="2531" firstAttribute="centerY" secondItem="2532" secondAttribute="centerY" id="lPF-fI-ol6"/>
<constraint firstItem="2529" firstAttribute="leading" secondItem="2531" secondAttribute="leading" id="vCh-Cd-x9H"/>
<constraint firstItem="2529" firstAttribute="top" secondItem="clp-fm-5xs" secondAttribute="top" constant="10" id="vT7-e0-BpD"/>
diff --git a/modules/gui/macosx/preferences/VLCSimplePrefsController.h b/modules/gui/macosx/preferences/VLCSimplePrefsController.h
index b7d0b49141..1dc0e73fb0 100644
--- a/modules/gui/macosx/preferences/VLCSimplePrefsController.h
+++ b/modules/gui/macosx/preferences/VLCSimplePrefsController.h
@@ -157,6 +157,7 @@
@property (readwrite, weak) IBOutlet NSButton *osd_forceboldCheckbox;
@property (readwrite, weak) IBOutlet NSBox *osd_osdBox;
@property (readwrite, weak) IBOutlet NSButton *osd_osdCheckbox;
+ @property (readwrite, weak) IBOutlet NSButton *osd_encodingAutoDetectCheckbox;
// video pane
@property (readwrite, strong) IBOutlet NSView *videoView;
diff --git a/modules/gui/macosx/preferences/VLCSimplePrefsController.m b/modules/gui/macosx/preferences/VLCSimplePrefsController.m
index 26d9589022..24295f1854 100644
--- a/modules/gui/macosx/preferences/VLCSimplePrefsController.m
+++ b/modules/gui/macosx/preferences/VLCSimplePrefsController.m
@@ -714,6 +714,7 @@ static inline const char * __config_GetLabel(vlc_object_t *p_this, const char *p
[self setupButton:_osd_encodingPopup forStringList: "subsdec-encoding"];
[self setupField:_osd_langTextField forOption: "sub-language" ];
+ [self setupButton:_osd_encodingAutoDetectCheckbox forBoolValue: "sub-autodetect-charset"];
[self setupField:_osd_fontTextField forOption: "freetype-font"];
[self setupButton:_osd_font_colorPopup forIntList: "freetype-color"];
@@ -1009,6 +1010,7 @@ static inline void save_string_list(intf_thread_t * p_intf, id object, const cha
config_PutPsz("subsdec-encoding", "");
config_PutPsz("sub-language", [[_osd_langTextField stringValue] UTF8String]);
+ config_PutInt("sub-autodetect-charset", [_osd_encodingAutoDetectCheckbox state]);
config_PutPsz("freetype-font", [[_osd_fontTextField stringValue] UTF8String]);
SaveIntList(_osd_font_colorPopup, "freetype-color");
diff --git a/modules/gui/qt/components/simple_preferences.cpp b/modules/gui/qt/components/simple_preferences.cpp
index d79fdf09f8..a35d2c7490 100644
--- a/modules/gui/qt/components/simple_preferences.cpp
+++ b/modules/gui/qt/components/simple_preferences.cpp
@@ -851,6 +851,7 @@ SPrefsPanel::SPrefsPanel( intf_thread_t *_p_intf, QWidget *_parent,
encoding );
CONFIG_GENERIC( "sub-language", String, ui.subLangLabel,
preferredLanguage );
+ CONFIG_BOOL( "sub-autodetect-charset", subDetectCharset );
CONFIG_GENERIC( "freetype-rel-fontsize", IntegerList,
ui.fontSizeLabel, fontSize );
diff --git a/modules/gui/qt/ui/sprefs_subtitles.ui b/modules/gui/qt/ui/sprefs_subtitles.ui
index b9e851dc9d..12ab2d56d5 100644
--- a/modules/gui/qt/ui/sprefs_subtitles.ui
+++ b/modules/gui/qt/ui/sprefs_subtitles.ui
@@ -136,6 +136,13 @@
</property>
</widget>
</item>
+ <item row="2" column="0">
+ <widget class="QCheckBox" name="subDetectCharset">
+ <property name="text">
+ <string>Try to auto-detect encoding</string>
+ </property>
+ </widget>
+ </item>
</layout>
</widget>
</item>
--
2.20.1
More information about the vlc-devel
mailing list