// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
using System;
using UnityEngine;
#if WINDOWS_UWP
using Windows.Foundation;
using Windows.Media.SpeechSynthesis;
using Windows.Storage.Streams;
using System.Linq;
using System.Threading.Tasks;
#endif
namespace Microsoft.MixedReality.Toolkit.Audio
{
    /// <summary>
    /// The en-US voices that can be used by <see cref="TextToSpeech"/>. Voices for all other locales are categorized as Other.
    /// </summary>
    public enum TextToSpeechVoice
    {
        /// <summary>
        /// The default system voice.
        /// </summary>
        Default,

        /// <summary>
        /// Microsoft David voice
        /// </summary>
        David,

        /// <summary>
        /// Microsoft Mark voice
        /// </summary>
        Mark,

        /// <summary>
        /// Microsoft Zira voice
        /// </summary>
        Zira,

        /// <summary>
        /// A voice not listed above (for non-en-US languages)
        /// </summary>
        Other
    }
    /// <summary>
    /// Enables text to speech using the Windows 10 SpeechSynthesizer class.
    /// </summary>
    /// <remarks>
    /// <para>SpeechSynthesizer generates speech as a SpeechSynthesisStream.</para>
    /// <para>This class converts that stream into a Unity AudioClip and plays the clip using
    /// the <see cref="AudioSource"/> you supply in the inspector. This allows you to position the voice
    /// as desired in 3D space. One recommended approach is to place the AudioSource on an empty
    /// GameObject that is a child of Main Camera and position it approximately 0.6 units above the
    /// camera. This placement sounds similar to Cortana's speech in the OS.</para>
    /// </remarks>
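    /// <example>
    /// A minimal setup sketch following the placement recommended above (object and variable names are illustrative, not part of the API):
    /// <code>
    /// // Create a speech source as a child of the main camera, 0.6 units above it.
    /// var speechObject = new GameObject("SpeechSource");
    /// speechObject.transform.SetParent(Camera.main.transform, false);
    /// speechObject.transform.localPosition = new Vector3(0f, 0.6f, 0f);
    ///
    /// // RequireComponent adds the needed AudioSource automatically.
    /// var tts = speechObject.AddComponent&lt;TextToSpeech&gt;();
    /// tts.StartSpeaking("Hello world");
    /// </code>
    /// </example>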
    [RequireComponent(typeof(AudioSource))]
    [AddComponentMenu("Scripts/MRTK/SDK/TextToSpeech")]
    public class TextToSpeech : MonoBehaviour
    {
        [Tooltip("The audio source where speech will be played.")]
        [SerializeField]
        private AudioSource audioSource;

        /// <summary>
        /// Gets or sets the audio source where speech will be played.
        /// </summary>
        public AudioSource AudioSource { get { return audioSource; } set { audioSource = value; } }

        /// <summary>
        /// Gets or sets the voice that will be used to generate speech. To use a non-en-US voice, set this to Other.
        /// </summary>
        /// <remarks>
        /// If a custom voice is desired (i.e. this enum is being set to Other), make sure to set the <see cref="VoiceName"/> property.
        /// </remarks>
        public TextToSpeechVoice Voice { get { return voice; } set { voice = value; } }

        [Tooltip("The voice that will be used to generate speech. To use a non en-US voice, set this to Other.")]
        [SerializeField]
        private TextToSpeechVoice voice;

        /// <summary>
        /// Gets or sets the name of the voice that will be used to generate speech.
        /// </summary>
        /// <remarks>
        /// When using a custom voice (i.e. <see cref="Voice"/> is set to Other), the voice must be selected through this property.
        /// </remarks>
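        /// <example>
        /// A usage sketch; "Microsoft Hazel" is an illustrative voice name that may not be installed on every device:
        /// <code>
        /// // A name that does not parse as a TextToSpeechVoice value sets
        /// // Voice to Other and stores the name as the custom voice.
        /// tts.VoiceName = "Microsoft Hazel";
        /// </code>
        /// </example>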
        public string VoiceName
        {
            get
            {
                return Voice != TextToSpeechVoice.Other ? Voice.ToString() : customVoice;
            }
            set
            {
                if (Enum.TryParse(value, out TextToSpeechVoice parsedVoice))
                {
                    Voice = parsedVoice;
                }
                else
                {
                    Voice = TextToSpeechVoice.Other;
                    customVoice = value;
                }
            }
        }

        [Tooltip("The custom voice that will be used to generate speech. See below for the list of available voices.")]
        [SerializeField]
        private string customVoice = string.Empty;
#if WINDOWS_UWP
        private SpeechSynthesizer synthesizer;
        private VoiceInformation voiceInfo;
        private bool speechTextInQueue = false;
#endif
        /// <summary>
        /// Converts two bytes to one float in the range -1 to 1.
        /// </summary>
        /// <param name="firstByte">The first byte.</param>
        /// <param name="secondByte">The second byte.</param>
        /// <returns>The converted float.</returns>
        private static float BytesToFloat(byte firstByte, byte secondByte)
        {
            // Convert two bytes to one short (little endian)
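            // e.g. (0x00, 0x80) -> -32768 -> -1.0f; (0xFF, 0x7F) -> 32767 -> ~0.99997f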
            short s = (short)((secondByte << 8) | firstByte);

            // Convert to range from -1 to (just below) 1
            return s / 32768.0F;
        }
        /// <summary>
        /// Converts an array of bytes to an integer.
        /// </summary>
        /// <param name="bytes">The byte array.</param>
        /// <param name="offset">An offset to read from.</param>
        /// <returns>The converted int.</returns>
        private static int BytesToInt(byte[] bytes, int offset = 0)
        {
            int value = 0;
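            // Little endian, e.g. { 0x44, 0xAC, 0x00, 0x00 } -> 0x0000AC44 = 44100 (a common sample rate)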
            for (int i = 0; i < 4; i++)
            {
                value |= ((int)bytes[offset + i]) << (i * 8);
            }
            return value;
        }
        /// <summary>
        /// Dynamically creates an <see cref="AudioClip"/> that represents raw Unity audio data.
        /// </summary>
        /// <param name="name">The name of the dynamically generated clip.</param>
        /// <param name="audioData">Raw Unity audio data.</param>
        /// <param name="sampleCount">The number of samples in the audio data.</param>
        /// <param name="frequency">The frequency of the audio data.</param>
        /// <returns>The <see cref="AudioClip"/>.</returns>
        private static AudioClip ToClip(string name, float[] audioData, int sampleCount, int frequency)
        {
            var clip = AudioClip.Create(name, sampleCount, 1, frequency, false);
            clip.SetData(audioData, 0);
            return clip;
        }
        /// <summary>
        /// Converts raw WAV data into Unity formatted audio data.
        /// </summary>
        /// <param name="wavAudio">The raw WAV data.</param>
        /// <param name="sampleCount">The number of samples in the audio data.</param>
        /// <param name="frequency">The frequency of the audio data.</param>
        /// <returns>The Unity formatted audio data.</returns>
        private static float[] ToUnityAudio(byte[] wavAudio, out int sampleCount, out int frequency)
        {
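            // This parser assumes the canonical 16-bit PCM WAV (RIFF) layout that
            // SpeechSynthesizer produces:
            //   bytes 0-11  : "RIFF" <file size> "WAVE"
            //   byte  22    : channel count (low byte of a 16-bit field)
            //   bytes 24-27 : sample rate in Hz, little endian
            //   byte  12+   : sub-chunks ("fmt ", optional chunks, then "data")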
            // Determine if mono or stereo
            int channelCount = wavAudio[22]; // Speech audio data is always mono but read actual header value for processing

            // Get the frequency
            frequency = BytesToInt(wavAudio, 24);

            // Get past all the other sub-chunks to get to the data sub-chunk
            int pos = 12; // First sub-chunk ID from 12 to 16

            // Keep iterating until we find the "data" chunk ID, i.e. ASCII "data" (0x64 0x61 0x74 0x61, or 100 97 116 97 in decimal)
            while (!(wavAudio[pos] == 100 && wavAudio[pos + 1] == 97 && wavAudio[pos + 2] == 116 && wavAudio[pos + 3] == 97))
            {
                pos += 4;
                int chunkSize = wavAudio[pos] + wavAudio[pos + 1] * 256 + wavAudio[pos + 2] * 65536 + wavAudio[pos + 3] * 16777216;
                pos += 4 + chunkSize;
            }
            pos += 8;

            // Pos is now positioned at the start of the actual sound data.
            sampleCount = (wavAudio.Length - pos) / 2; // 2 bytes per sample (16 bit mono)
            if (channelCount == 2) { sampleCount /= 2; } // 4 bytes per sample (16 bit stereo)

            // Allocate memory (supporting left channel only)
            var unityData = new float[sampleCount];

            // Copy the samples into the float array
            int i = 0;
            while (pos < wavAudio.Length)
            {
                unityData[i] = BytesToFloat(wavAudio[pos], wavAudio[pos + 1]);
                pos += 2;
                if (channelCount == 2)
                {
                    pos += 2;
                }
                i++;
            }
            return unityData;
        }
#if WINDOWS_UWP
        /// <summary>
        /// Executes a function that generates a speech stream and then converts and plays it in Unity.
        /// </summary>
        /// <param name="text">
        /// A raw text version of what's being spoken for use in debug messages when speech isn't supported.
        /// </param>
        /// <param name="speakFunc">
        /// The actual function that will be executed to generate speech.
        /// </param>
        private void PlaySpeech(string text, Func<IAsyncOperation<SpeechSynthesisStream>> speakFunc)
        {
            // Make sure there's something to speak
            if (speakFunc == null) throw new ArgumentNullException(nameof(speakFunc));

            if (synthesizer != null)
            {
                speechTextInQueue = true;

                // Synthesis needs await, so it runs as a Task on a worker thread.
                // This frees up Unity's main thread to keep running. Note that a
                // try/catch around Task.Run would not observe exceptions thrown
                // inside the task, so errors are handled within the lambda itself.
                Task.Run(async () =>
                {
                    try
                    {
                        // Change voice?
                        if (voice != TextToSpeechVoice.Default)
                        {
                            // See if it's never been found or is changing
                            if ((voiceInfo == null) || (!voiceInfo.DisplayName.Contains(VoiceName)))
                            {
                                // Search for voice info
                                voiceInfo = SpeechSynthesizer.AllVoices.FirstOrDefault(v => v.DisplayName.Contains(VoiceName));

                                // If found, select
                                if (voiceInfo != null)
                                {
                                    synthesizer.Voice = voiceInfo;
                                }
                                else
                                {
                                    Debug.LogErrorFormat("TTS voice {0} could not be found.", VoiceName);
                                }
                            }
                        }
                        else
                        {
                            synthesizer.Voice = SpeechSynthesizer.DefaultVoice;
                        }

                        // Speak and get the stream
                        var speechStream = await speakFunc();

                        // Get the size of the original stream
                        var size = speechStream.Size;

                        // Create a buffer large enough to hold the entire stream
                        byte[] buffer = new byte[(int)size];

                        // Get the input stream at the start of the data
                        using (var inputStream = speechStream.GetInputStreamAt(0))
                        {
                            // Close the original speech stream to free up memory
                            speechStream.Dispose();

                            // Create a new data reader off the input stream
                            using (var dataReader = new DataReader(inputStream))
                            {
                                // Load all bytes into the reader
                                await dataReader.LoadAsync((uint)size);

                                // Copy from reader into buffer
                                dataReader.ReadBytes(buffer);
                            }
                        }

                        // Convert raw WAV data into Unity audio data
                        int sampleCount = 0;
                        int frequency = 0;
                        var unityData = ToUnityAudio(buffer, out sampleCount, out frequency);

                        // The remainder must be done back on Unity's main thread
                        UnityEngine.WSA.Application.InvokeOnAppThread(() =>
                        {
                            // Convert to an audio clip
                            var clip = ToClip("Speech", unityData, sampleCount, frequency);

                            // Set the clip on the audio source
                            audioSource.clip = clip;

                            // Play audio
                            audioSource.Play();
                            speechTextInQueue = false;
                        }, false);
                    }
                    catch (Exception ex)
                    {
                        speechTextInQueue = false;
                        Debug.LogErrorFormat("Speech generation problem: \"{0}\"", ex.Message);
                    }
                });
            }
            else
            {
                Debug.LogErrorFormat("Speech not initialized. \"{0}\"", text);
            }
        }
#endif
        private void Awake()
        {
            try
            {
                if (audioSource == null)
                {
                    audioSource = GetComponent<AudioSource>();
                }
#if WINDOWS_UWP
                synthesizer = new SpeechSynthesizer();
#endif
            }
            catch (Exception ex)
            {
                Debug.LogError("Could not start Speech Synthesis: " + ex.Message);
            }
        }
        // Public Methods

        /// <summary>
        /// Speaks the specified SSML markup using text-to-speech.
        /// </summary>
        /// <param name="ssml">The SSML markup to speak.</param>
        public void SpeakSsml(string ssml)
        {
            // Make sure there's something to speak
            if (string.IsNullOrEmpty(ssml)) { return; }

            // Pass to helper method
#if WINDOWS_UWP
            PlaySpeech(ssml, () => synthesizer.SynthesizeSsmlToStreamAsync(ssml));
#else
            Debug.LogWarningFormat("Text to Speech not supported in editor.\n\"{0}\"", ssml);
#endif
        }
        /// <summary>
        /// Speaks the specified text using text-to-speech.
        /// </summary>
        /// <param name="text">The text to speak.</param>
        public void StartSpeaking(string text)
        {
            // Make sure there's something to speak
            if (string.IsNullOrEmpty(text)) { return; }

            // Pass to helper method
#if WINDOWS_UWP
            PlaySpeech(text, () => synthesizer.SynthesizeTextToStreamAsync(text));
#else
            Debug.LogWarningFormat("Text to Speech not supported in editor.\n\"{0}\"", text);
#endif
        }
        /// <summary>
        /// Returns whether text has been submitted and is still being processed by the PlaySpeech method.
        /// Handy for avoiding situations where text has been submitted but the audio clip is not yet
        /// ready because the audio source isn't playing yet.
        /// </summary>
        /// <example>
        /// <code>
        /// yield return new WaitWhile(() => textToSpeechManager.SpeechTextInQueue() || textToSpeechManager.IsSpeaking());
        /// </code>
        /// </example>
        public bool SpeechTextInQueue()
        {
#if WINDOWS_UWP
            return speechTextInQueue;
#else
            return false;
#endif
        }
        /// <summary>
        /// Returns whether or not the AudioSource is actively playing.
        /// </summary>
        /// <returns>
        /// True, if the AudioSource is playing. False, if the AudioSource is not playing or is null.
        /// </returns>
        public bool IsSpeaking()
        {
            if (audioSource != null)
            {
                return audioSource.isPlaying;
            }
            return false;
        }
        /// <summary>
        /// Stops text-to-speech playback.
        /// </summary>
        public void StopSpeaking()
        {
            if (IsSpeaking())
            {
                audioSource.Stop();
            }
        }
    }
}