[vlc-commits] Add a SAPI synthetizer for Windows

Sun Oct 11 21:49:33 CEST 2015

vlc | branch: master | Jean-Baptiste Kempf <jb at videolan.org> | Sun Oct 11 18:26:11 2015 +0200| [a7eb0f0aa5c3003535a091364160c918db97c4ec] | committer: Jean-Baptiste Kempf

Add a SAPI synthetizer for Windows

This is the work of Moti Zilberman, modified by me to build and
integrate in-tree.

It is heavily inspired by the OS X one.

Be careful: you need a very recent MinGW-w64 to have any hope of compiling it.

Ref #11893

> http://git.videolan.org/gitweb.cgi/vlc.git/?a=commit;h=a7eb0f0aa5c3003535a091364160c918db97c4ec
---

 NEWS                              |    1 +
 modules/MODULES_LIST              |    1 +
 modules/text_renderer/Makefile.am |    6 +
 modules/text_renderer/sapi.cpp    |  236 +++++++++++++++++++++++++++++++++++++
 po/POTFILES.in                    |    1 +
 5 files changed, 245 insertions(+)

diff --git a/NEWS b/NEWS
index 2023bc9..16ea9e3 100644
--- a/NEWS
+++ b/NEWS
@@ -163,6 +163,7 @@ Misc
  * remove ZPL playlist format
  * Update libVLC doxygen modules
  * Add a text-to-speech renderer for subtitles on OS X/iOS
+ * Add a text-to-speech renderer for subtitles on Windows
 
 Removed modules
  * Atmo video filter
diff --git a/modules/MODULES_LIST b/modules/MODULES_LIST
index 472ad05..a34dc6e 100644
--- a/modules/MODULES_LIST
+++ b/modules/MODULES_LIST
@@ -319,6 +319,7 @@ $Id$
  * rv32: RV32 image format conversion module
  * samplerate: Secret Rabbit Code (libsamplerate) audio resampler
  * sap: Interface module to read SAP/SDP announcements
+ * sapi: Windows Text to Speech Synthetizer using the SAPI 5.1 API
  * scale: Images rescaler
  * scaletempo: Scale audio tempo in sync with playback rate
  * scene: scene video filter
diff --git a/modules/text_renderer/Makefile.am b/modules/text_renderer/Makefile.am
index 43ccb67..7634a4f 100644
--- a/modules/text_renderer/Makefile.am
+++ b/modules/text_renderer/Makefile.am
@@ -42,6 +42,12 @@ libnsspeechsynthesizer_plugin_la_LDFLAGS = -Wl,-framework,Cocoa
 text_LTLIBRARIES += libnsspeechsynthesizer_plugin.la
 endif
 
+libsapi_plugin_la_SOURCES = text_renderer/sapi.cpp
+libsapi_plugin_la_LIBADD = -lole32
+if HAVE_WIN32
+text_LTLIBRARIES += libsapi_plugin.la
+endif
+
 libsvg_plugin_la_SOURCES = text_renderer/svg.c
 libsvg_plugin_la_CPPFLAGS = $(AM_CPPFLAGS) $(SVG_CFLAGS)
 libsvg_plugin_la_LDFLAGS = $(AM_LDFLAGS) -rpath '$(textdir)'
diff --git a/modules/text_renderer/sapi.cpp b/modules/text_renderer/sapi.cpp
new file mode 100644
index 0000000..5df8dcf
--- /dev/null
+++ b/modules/text_renderer/sapi.cpp
@@ -0,0 +1,236 @@
+/*****************************************************************************
+ * sapi.cpp: Simple text to Speech renderer for Windows, based on SAPI
+ *****************************************************************************
+ * Copyright (c) 2015 Moti Zilberman
+ *
+ * Authors: Moti Zilberman
+ *          Jean-Baptiste Kempf
+ *
+ * The MIT License (MIT)
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *****************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+/* VLC core API headers */
+#include <vlc_common.h>
+#include <vlc_plugin.h>
+#include <vlc_filter.h>
+#include <vlc_charset.h>
+
+#define INITGUID
+
+#include <windows.h>
+#include <sapi.h>
+#include <sphelper.h>
+
+static int Create (vlc_object_t *);   /* module callback passed to set_callbacks(); sets up the filter */
+static void Destroy(vlc_object_t *);  /* module callback passed to set_callbacks(); frees what Create() set up */
+static int RenderText(filter_t *,     /* installed as pf_render by Create() */
+                      subpicture_region_t *,  /* output region, unused by this module */
+                      subpicture_region_t *,  /* input region carrying the text segments */
+                      const vlc_fourcc_t *);  /* chroma list, unused by this module */
+
+vlc_module_begin ()  /* VLC plugin descriptor for the SAPI speech renderer */
+ set_description(N_("Speech synthesis for Windows"))
+ /* Listed with the video subpicture modules. */
+ set_category(CAT_VIDEO)
+ set_subcategory(SUBCAT_VIDEO_SUBPIC)
+ /* NOTE(review): the 0 is presumably the module priority (never auto-selected) - confirm. */
+ set_capability("text renderer", 0)
+ set_callbacks(Create, Destroy)
+ add_integer("sapi-voice", -1, "Voice Index", "Voice index", false)  /* default -1: Create() keeps the default voice */
+vlc_module_end ()
+
+struct filter_sys_t  /* private state stored in filter_t::p_sys */
+{
+    ISpVoice* cpVoice;   /* SAPI voice created in Create(), released in Destroy() */
+    char* lastString;    /* heap copy of the last text handed to Speak(); used to skip repeats */
+};
+
+/* MTA functions */
+static int TryEnterMTA(vlc_object_t *obj)
+{
+    HRESULT hr = CoInitializeEx(nullptr, COINIT_MULTITHREADED);
+    if (unlikely(FAILED(hr)))
+    {
+        msg_Err (obj, "cannot initialize COM (error 0x%lx)", hr);
+        return -1;
+    }
+    return 0;
+}
+#define TryEnterMTA(o) TryEnterMTA(VLC_OBJECT(o))
+
+static void EnterMTA(void)
+{
+    /* Same COM initialisation as TryEnterMTA(), but a failure aborts the process. */
+    if (unlikely(FAILED(CoInitializeEx(nullptr, COINIT_MULTITHREADED))))
+        abort();
+}
+
+static void LeaveMTA(void)  /* counterpart of TryEnterMTA()/EnterMTA() */
+{
+    CoUninitialize();  /* pairs with the CoInitializeEx() call made when entering */
+}
+
+/* Module activation callback: creates the SAPI voice and, when "sapi-voice"
+ * is non-negative, tries to select that voice (failures keep the default one).
+ * Returns VLC_SUCCESS, or VLC_EGENERIC if COM, memory or SpVoice is unavailable. */
+static int Create (vlc_object_t *p_this)
+{
+    filter_t *p_filter = (filter_t *)p_this;
+    filter_sys_t *p_sys;
+    HRESULT hr;
+
+    if (TryEnterMTA(p_this))
+        return VLC_EGENERIC;
+
+    p_filter->p_sys = p_sys = (filter_sys_t*) malloc(sizeof(filter_sys_t));
+    if (!p_sys)
+        goto error;
+
+    p_sys->cpVoice = nullptr;
+    p_sys->lastString = nullptr;
+
+    hr = CoCreateInstance(CLSID_SpVoice, nullptr, CLSCTX_INPROC_SERVER, IID_ISpVoice, (void**) &p_sys->cpVoice);
+    if (SUCCEEDED(hr)) {
+        ISpObjectToken*        cpVoiceToken = nullptr;
+        IEnumSpObjectTokens*   cpEnum = nullptr;
+        ULONG ulCount = 0;
+
+        hr = SpEnumTokens(SPCAT_VOICES, nullptr, nullptr, &cpEnum);
+        if (SUCCEEDED(hr))
+        {
+            // Get the number of voices.
+            hr = cpEnum->GetCount(&ulCount);
+            if (SUCCEEDED (hr))
+            {
+                int voiceIndex = var_InheritInteger(p_this, "sapi-voice");
+                if (voiceIndex > -1)
+                {
+                    // Valid indices are 0 .. ulCount - 1: the count itself is out of range.
+                    if ((unsigned)voiceIndex < ulCount) {
+                        hr = cpEnum->Item(voiceIndex, &cpVoiceToken);
+                        if (SUCCEEDED(hr)) {
+                            hr = p_sys->cpVoice->SetVoice(cpVoiceToken);
+                            if (SUCCEEDED(hr))
+                                msg_Dbg(p_this, "Selected voice %d", voiceIndex);
+                            else
+                                msg_Err(p_this, "Failed to set voice %d", voiceIndex);
+                            cpVoiceToken->Release();
+                            cpVoiceToken = nullptr;
+                        }
+                    }
+                    else
+                        msg_Err(p_this, "Voice index exceeds available count");
+                }
+            }
+            cpEnum->Release();
+            cpEnum = nullptr;
+
+            /* Set Output */
+            hr = p_sys->cpVoice->SetOutput(nullptr, TRUE);
+        }
+    }
+    else
+    {
+        msg_Err(p_filter, "Could not create SpVoice");
+        goto error;
+    }
+
+    LeaveMTA();
+    p_filter->pf_render = RenderText;
+    return VLC_SUCCESS;
+
+error:
+    LeaveMTA();
+    free(p_sys);
+    return VLC_EGENERIC;
+}
+
+/* Module callback paired with Create(): drops the voice and frees the state. */
+static void Destroy(vlc_object_t *p_this)
+{
+    filter_sys_t *sys = ((filter_t *)p_this)->p_sys;
+
+    /* Give back the COM voice if one is held. */
+    if (sys->cpVoice != nullptr)
+    {
+        sys->cpVoice->Release();
+        sys->cpVoice = nullptr;
+    }
+
+    /* free() ignores a null pointer, so the cached text needs no guard. */
+    free(sys->lastString);
+    sys->lastString = nullptr;
+    free(sys);
+}
+
+/* Text renderer callback (installed as pf_render): speaks the text segments of
+ * p_region_in through SAPI; p_region_out and p_chroma_list are unused. Returns
+ * VLC_EGENERIC if there is no segment, else VLC_SUCCESS (speech errors are only logged). */
+static int RenderText(filter_t *p_filter,
+        subpicture_region_t *p_region_out,
+        subpicture_region_t *p_region_in,
+        const vlc_fourcc_t *p_chroma_list)
+{
+    VLC_UNUSED(p_region_out);
+    VLC_UNUSED(p_chroma_list);
+
+    filter_sys_t *p_sys = p_filter->p_sys;
+    text_segment_t *p_segment = p_region_in->p_text;
+
+    if (!p_segment)
+        return VLC_EGENERIC;
+
+    for (const text_segment_t *s = p_segment; s != nullptr; s = s->p_next ) {
+        /* Nothing to say for a missing, empty or newline-only segment. */
+        if (!s->psz_text || s->psz_text[0] == '\0' || !strcmp(s->psz_text, "\n"))
+            continue;
+
+        /* Do not repeat the text that was spoken last. */
+        if (p_sys->lastString && !strcmp(p_sys->lastString, s->psz_text))
+            continue;
+
+        try {
+            /* Drop the previous copy first: overwriting the pointer leaked it. */
+            free(p_sys->lastString);
+            p_sys->lastString = strdup(s->psz_text);
+            if (!p_sys->lastString)
+                continue;
+
+            msg_Dbg(p_filter, "Speaking '%s'", s->psz_text);
+            EnterMTA();
+            wchar_t* wideText = ToWide(s->psz_text);
+            HRESULT hr = p_sys->cpVoice->Speak(wideText, SPF_ASYNC, nullptr);
+            free(wideText);
+            if (FAILED(hr))
+                msg_Err(p_filter, "Speak() error");
+            LeaveMTA();
+        }
+        catch (...) {
+            msg_Err(p_filter, "Caught an exception!");
+        }
+    }
+
+    return VLC_SUCCESS;
+}
diff --git a/po/POTFILES.in b/po/POTFILES.in
index 49e4d25..8077a0c 100644
--- a/po/POTFILES.in
+++ b/po/POTFILES.in
@@ -1063,6 +1063,7 @@ modules/stream_out/transcode/transcode.c
 modules/text_renderer/freetype.c
 modules/text_renderer/nsspeechsynthesizer.m
 modules/text_renderer/quartztext.c
+modules/text_renderer/sapi.cpp
 modules/text_renderer/svg.c
 modules/text_renderer/tdummy.c
 modules/text_renderer/win32text.c