// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

using System;
using UnityEngine;

#if WINDOWS_UWP
using Windows.Foundation;
using Windows.Media.SpeechSynthesis;
using Windows.Storage.Streams;
using System.Linq;
using System.Threading.Tasks;
#endif

namespace Microsoft.MixedReality.Toolkit.Audio
{
    /// <summary>
    /// The en-US voices that can be used by <see cref="TextToSpeech"/>. Voices for all other locales are categorized as Other.
    /// </summary>
    public enum TextToSpeechVoice
    {
        /// <summary>
        /// The default system voice.
        /// </summary>
        Default,

        /// <summary>
        /// Microsoft David voice
        /// </summary>
        David,

        /// <summary>
        /// Microsoft Mark voice
        /// </summary>
        Mark,

        /// <summary>
        /// Microsoft Zira voice
        /// </summary>
        Zira,

        /// <summary>
        /// Voice not listed above (for non en-US languages)
        /// </summary>
        Other
    }

    /// <summary>
    /// Enables text to speech using the Windows 10 SpeechSynthesizer class.
    /// </summary>
    /// <remarks>
    /// SpeechSynthesizer generates speech as a SpeechSynthesisStream. This class converts that
    /// stream into a Unity AudioClip and plays the clip using the <see cref="AudioSource"/> you
    /// supply in the inspector. This allows you to position the voice as desired in 3D space.
    /// One recommended approach is to place the AudioSource on an empty GameObject that is a
    /// child of Main Camera and position it approximately 0.6 units above the camera. This
    /// orientation will sound similar to Cortana's speech in the OS.
    /// </remarks>
    [RequireComponent(typeof(AudioSource))]
    [AddComponentMenu("Scripts/MRTK/SDK/TextToSpeech")]
    public class TextToSpeech : MonoBehaviour
    {
        [Tooltip("The audio source where speech will be played.")]
        [SerializeField]
        private AudioSource audioSource;

        /// <summary>
        /// Gets or sets the audio source where speech will be played.
        /// </summary>
        public AudioSource AudioSource
        {
            get { return audioSource; }
            set { audioSource = value; }
        }

        /// <summary>
        /// Gets or sets the voice that will be used to generate speech. To use a non en-US voice, set this to Other.
        /// </summary>
        /// <remarks>
        /// If a custom voice is desired (i.e. this enum is being set to Other) make sure to set the <see cref="VoiceName"/> property.
        /// </remarks>
        public TextToSpeechVoice Voice
        {
            get { return voice; }
            set { voice = value; }
        }

        [Tooltip("The voice that will be used to generate speech. To use a non en-US voice, set this to Other.")]
        [SerializeField]
        private TextToSpeechVoice voice;

        /// <summary>
        /// Gets or sets the name of the voice that will be used to generate speech.
        /// </summary>
        /// <remarks>
        /// It is required to set the voice through this property when using a custom voice.
        /// </remarks>
        public string VoiceName
        {
            get
            {
                return Voice != TextToSpeechVoice.Other ? Voice.ToString() : customVoice;
            }
            set
            {
                if (Enum.TryParse(value, out TextToSpeechVoice parsedVoice))
                {
                    Voice = parsedVoice;
                }
                else
                {
                    Voice = TextToSpeechVoice.Other;
                    customVoice = value;
                }
            }
        }

        [Tooltip("The custom voice that will be used to generate speech. See below for the list of available voices.")]
        [SerializeField]
        private string customVoice = string.Empty;

#if WINDOWS_UWP
        private SpeechSynthesizer synthesizer;
        private VoiceInformation voiceInfo;
        private bool speechTextInQueue = false;
#endif

        /// <summary>
        /// Converts two bytes to one float in the range -1 to 1.
        /// </summary>
        /// <param name="firstByte">The first byte.</param>
        /// <param name="secondByte">The second byte.</param>
        /// <returns>The converted float.</returns>
        private static float BytesToFloat(byte firstByte, byte secondByte)
        {
            // Convert two bytes to one short (little endian)
            short s = (short)((secondByte << 8) | firstByte);

            // Convert to range from -1 to (just below) 1
            return s / 32768.0F;
        }
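        // Worked example (added commentary, not from the original source): PCM samples are
        // little endian, so the byte pair 0x00 0x80 is read as the short -32768, which
        // BytesToFloat maps to -1.0f; the pair 0xFF 0x7F is 32767, mapping to just under 1.0f.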
        /// <summary>
        /// Converts an array of bytes to an integer.
        /// </summary>
        /// <param name="bytes">The byte array.</param>
        /// <param name="offset">An offset to read from.</param>
        /// <returns>The converted int.</returns>
        private static int BytesToInt(byte[] bytes, int offset = 0)
        {
            int value = 0;
            for (int i = 0; i < 4; i++)
            {
                value |= ((int)bytes[offset + i]) << (i * 8);
            }
            return value;
        }

        /// <summary>
        /// Dynamically creates an AudioClip that represents raw Unity audio data.
        /// </summary>
        /// <param name="name">The name of the dynamically generated clip.</param>
        /// <param name="audioData">Raw Unity audio data.</param>
        /// <param name="sampleCount">The number of samples in the audio data.</param>
        /// <param name="frequency">The frequency of the audio data.</param>
        /// <returns>The AudioClip.</returns>
        private static AudioClip ToClip(string name, float[] audioData, int sampleCount, int frequency)
        {
            var clip = AudioClip.Create(name, sampleCount, 1, frequency, false);
            clip.SetData(audioData, 0);
            return clip;
        }

        /// <summary>
        /// Converts raw WAV data into Unity formatted audio data.
        /// </summary>
        /// <param name="wavAudio">The raw WAV data.</param>
        /// <param name="sampleCount">The number of samples in the audio data.</param>
        /// <param name="frequency">The frequency of the audio data.</param>
        /// <returns>The Unity formatted audio data.</returns>
        private static float[] ToUnityAudio(byte[] wavAudio, out int sampleCount, out int frequency)
        {
            // Determine if mono or stereo. Speech audio data is always mono,
            // but read the actual header value for processing.
            int channelCount = wavAudio[22];

            // Get the frequency
            frequency = BytesToInt(wavAudio, 24);

            // Get past all the other sub chunks to get to the data subchunk
            int pos = 12;   // First subchunk ID from 12 to 16

            // Keep iterating until we find the "data" chunk (hex 64 61 74 61, i.e. 100 97 116 97 in decimal)
            while (!(wavAudio[pos] == 100 && wavAudio[pos + 1] == 97 && wavAudio[pos + 2] == 116 && wavAudio[pos + 3] == 97))
            {
                pos += 4;
                int chunkSize = wavAudio[pos] + wavAudio[pos + 1] * 256 + wavAudio[pos + 2] * 65536 + wavAudio[pos + 3] * 16777216;
                pos += 4 + chunkSize;
            }
            pos += 8;

            // Pos is now positioned at the start of the actual sound data.
            sampleCount = (wavAudio.Length - pos) / 2;     // 2 bytes per sample (16 bit mono)
            if (channelCount == 2) { sampleCount /= 2; }   // 4 bytes per sample (16 bit stereo)

            // Allocate memory (supporting left channel only)
            var unityData = new float[sampleCount];

            // Write to the float array, skipping the right channel if present
            int i = 0;
            while (pos < wavAudio.Length)
            {
                unityData[i] = BytesToFloat(wavAudio[pos], wavAudio[pos + 1]);
                pos += 2;
                if (channelCount == 2)
                {
                    pos += 2;
                }
                i++;
            }

            return unityData;
        }
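        // WAV header reference for the parsing above (added commentary; offsets assume the
        // canonical RIFF layout with a standard "fmt " chunk starting at byte 12, which is
        // what ToUnityAudio expects):
        //   bytes  0-3    "RIFF"                                    bytes  8-11   "WAVE"
        //   bytes 22-23   channel count (the code reads the low byte)
        //   bytes 24-27   sample rate (little endian)
        // Any number of optional chunks may follow "fmt "; the loop above skips each one by
        // reading its 4-byte size until it reaches the "data" chunk, then skips the 8-byte
        // chunk header to land on the first sample.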
#if WINDOWS_UWP
        /// <summary>
        /// Executes a function that generates a speech stream and then converts and plays it in Unity.
        /// </summary>
        /// <param name="text">
        /// A raw text version of what's being spoken for use in debug messages when speech isn't supported.
        /// </param>
        /// <param name="speakFunc">
        /// The actual function that will be executed to generate speech.
        /// </param>
        private void PlaySpeech(string text, Func<IAsyncOperation<SpeechSynthesisStream>> speakFunc)
        {
            // Make sure there's something to speak
            if (speakFunc == null) throw new ArgumentNullException(nameof(speakFunc));

            if (synthesizer != null)
            {
                try
                {
                    speechTextInQueue = true;

                    // Need await, so most of this will be run as a new Task in its own thread.
                    // This is good since it frees up Unity to keep running anyway.
                    Task.Run(async () =>
                    {
                        // Change voice?
                        if (voice != TextToSpeechVoice.Default)
                        {
                            // See if it's never been found or is changing
                            if ((voiceInfo == null) || (!voiceInfo.DisplayName.Contains(VoiceName)))
                            {
                                // Search for voice info
                                voiceInfo = SpeechSynthesizer.AllVoices.Where(v => v.DisplayName.Contains(VoiceName)).FirstOrDefault();

                                // If found, select
                                if (voiceInfo != null)
                                {
                                    synthesizer.Voice = voiceInfo;
                                }
                                else
                                {
                                    Debug.LogErrorFormat("TTS voice {0} could not be found.", VoiceName);
                                }
                            }
                        }
                        else
                        {
                            synthesizer.Voice = SpeechSynthesizer.DefaultVoice;
                        }

                        // Speak and get stream
                        var speechStream = await speakFunc();

                        // Get the size of the original stream and create a buffer to hold it
                        var size = speechStream.Size;
                        byte[] buffer = new byte[(int)size];

                        // Get an input stream positioned at the start of the speech data
                        using (var inputStream = speechStream.GetInputStreamAt(0))
                        {
                            // Close the original speech stream to free up memory
                            speechStream.Dispose();

                            // Create a new data reader off the input stream
                            using (var dataReader = new DataReader(inputStream))
                            {
                                // Load all bytes into the reader
                                await dataReader.LoadAsync((uint)size);

                                // Copy from reader into buffer
                                dataReader.ReadBytes(buffer);
                            }
                        }

                        // Convert raw WAV data into Unity audio data
                        int sampleCount = 0;
                        int frequency = 0;
                        var unityData = ToUnityAudio(buffer, out sampleCount, out frequency);

                        // The remainder must be done back on Unity's main thread
                        UnityEngine.WSA.Application.InvokeOnAppThread(() =>
                        {
                            // Convert to an audio clip
                            var clip = ToClip("Speech", unityData, sampleCount, frequency);

                            // Set the clip on the audio source and play it
                            audioSource.clip = clip;
                            audioSource.Play();
                            speechTextInQueue = false;
                        }, false);
                    });
                }
                catch (Exception ex)
                {
                    speechTextInQueue = false;
                    Debug.LogErrorFormat("Speech generation problem: \"{0}\"", ex.Message);
                }
            }
            else
            {
                Debug.LogErrorFormat("Speech not initialized. \"{0}\"", text);
            }
        }
#endif

        private void Awake()
        {
            try
            {
                if (audioSource == null)
                {
                    audioSource = GetComponent<AudioSource>();
                }

#if WINDOWS_UWP
                synthesizer = new SpeechSynthesizer();
#endif
            }
            catch (Exception ex)
            {
                Debug.LogError("Could not start Speech Synthesis: " + ex.Message);
            }
        }

        // Public Methods

        /// <summary>
        /// Speaks the specified SSML markup using text-to-speech.
        /// </summary>
        /// <param name="ssml">The SSML markup to speak.</param>
        public void SpeakSsml(string ssml)
        {
            // Make sure there's something to speak
            if (string.IsNullOrEmpty(ssml)) { return; }

            // Pass to helper method
#if WINDOWS_UWP
            PlaySpeech(ssml, () => synthesizer.SynthesizeSsmlToStreamAsync(ssml));
#else
            Debug.LogWarningFormat("Text to Speech not supported in editor.\n\"{0}\"", ssml);
#endif
        }

        /// <summary>
        /// Speaks the specified text using text-to-speech.
        /// </summary>
        /// <param name="text">The text to speak.</param>
        public void StartSpeaking(string text)
        {
            // Make sure there's something to speak
            if (string.IsNullOrEmpty(text)) { return; }

            // Pass to helper method
#if WINDOWS_UWP
            PlaySpeech(text, () => synthesizer.SynthesizeTextToStreamAsync(text));
#else
            Debug.LogWarningFormat("Text to Speech not supported in editor.\n\"{0}\"", text);
#endif
        }

        /// <summary>
        /// Returns whether text has been submitted and is still being processed by the PlaySpeech method.
        /// Handy for avoiding situations where text has been submitted but the audio clip is not ready yet because the audio source isn't playing.
        /// Example: yield return new WaitWhile(() => textToSpeechManager.SpeechTextInQueue() || textToSpeechManager.IsSpeaking())
        /// </summary>
        public bool SpeechTextInQueue()
        {
#if WINDOWS_UWP
            return speechTextInQueue;
#else
            return false;
#endif
        }
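        // Usage sketch (added commentary, based on the WaitWhile example above; assumes a
        // hypothetical caller holding a reference named textToSpeech):
        //
        //   private IEnumerator SpeakThenContinue()
        //   {
        //       textToSpeech.StartSpeaking("Hello world");
        //       yield return new WaitWhile(() => textToSpeech.SpeechTextInQueue() || textToSpeech.IsSpeaking());
        //       // Playback has finished here.
        //   }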
        /// <summary>
        /// Returns whether or not the AudioSource is actively playing.
        /// </summary>
        /// <returns>
        /// True, if the AudioSource is playing. False, if the AudioSource is not playing or is null.
        /// </returns>
        public bool IsSpeaking()
        {
            if (audioSource != null)
            {
                return audioSource.isPlaying;
            }

            return false;
        }

        /// <summary>
        /// Stops text-to-speech playback.
        /// </summary>
        public void StopSpeaking()
        {
            if (IsSpeaking())
            {
                audioSource.Stop();
            }
        }
    }
}