Fix Kokoro TTS integration issues: Remove API key requirement and ensure system-specific options display correctly
This commit is contained in:
@@ -0,0 +1,570 @@
|
||||
/**
 * BrowserTTSModule for AI Interactive Fiction
 * Implementation using the browser's Web Speech API
 */
|
||||
import { TTSHandlerModule } from './tts-handler-module.js';
|
||||
|
||||
/**
 * Browser TTS Module - Uses the browser's Web Speech API for TTS
 */
|
||||
export class BrowserTTSModule extends TTSHandlerModule {
  /**
   * Create the module with default voice options. No browser API is touched
   * here; all Web Speech access happens in initialize().
   */
  constructor() {
    super('browser', 'Browser TTS');

    // Options applied to every utterance; `voice` is chosen during initialize().
    this.voiceOptions = {
      voice: null,
      rate: 1.0,
      pitch: 1.0,
      volume: 1.0
    };

    // State
    this.available = false;
    this.currentUtterance = null;

    // Voice selection may run before loadVoices() has stored anything, so make
    // sure the list is always an array.
    if (!Array.isArray(this.voices)) {
      this.voices = [];
    }

    // Prevents stacking duplicate 'locale:changed' listeners if initialize()
    // is called more than once.
    this.localeListenerAttached = false;

    // NOTE(review): the parent class is expected to declare the
    // 'persistence-manager' and 'localization' dependencies - confirm.

    // Bind additional methods beyond those in TTSHandlerModule
    this.bindMethods([
      'onVoicesChanged',
      'loadVoices',
      'selectVoiceForLocale',
      'synthesizeToWav',
      'speakPreloaded',
      'speak',
      'preprocessText',
      'inferVoiceGender'
    ]);
  }

  /**
   * Initialize the browser TTS module: verify browser support, initialize the
   * parent, load voices, read stored preferences and pick a voice for the
   * current locale.
   * @returns {Promise<boolean>} - Resolves with success status
   */
  async initialize() {
    try {
      this.reportProgress(10, 'Initializing Browser TTS');

      // Check for browser support
      if (!window.speechSynthesis) {
        console.error('Browser TTS: Speech synthesis not available in this browser');
        return false;
      }

      this.reportProgress(30, 'Browser TTS supported');

      // Initialize parent
      const parentInit = await super.initialize();
      if (!parentInit) {
        console.error('Browser TTS: Parent initialization failed');
        return false;
      }

      // Get required dependencies
      const persistenceManager = this.getModule('persistence-manager');
      if (!persistenceManager) {
        console.error('Browser TTS: Required dependency persistence-manager not found');
        return false;
      }

      const localization = this.getModule('localization');
      if (!localization) {
        console.error('Browser TTS: Required dependency localization not found');
        return false;
      }

      // Load voices
      const voicesLoaded = await this.loadVoices();
      if (!voicesLoaded) {
        console.error('Browser TTS: Failed to load voices');
        return false;
      }

      // Set speech options from preferences
      this.voiceOptions.rate = persistenceManager.getPreference('tts', 'rate', 1.0);
      this.voiceOptions.pitch = persistenceManager.getPreference('tts', 'pitch', 1.0);
      this.voiceOptions.volume = persistenceManager.getPreference('tts', 'volume', 1.0);
      const preferredVoice = persistenceManager.getPreference('tts', 'browser_voice', '');

      // Set voice based on current locale
      const currentLocale = localization.getLocale() || 'en-us';
      await this.selectVoiceForLocale(currentLocale, preferredVoice);

      // Listen for locale changes (only once per instance)
      if (!this.localeListenerAttached) {
        document.addEventListener('locale:changed', async (event) => {
          if (!event.detail || !event.detail.locale) {
            return;
          }
          try {
            await this.selectVoiceForLocale(event.detail.locale);
          } catch (error) {
            // An async listener has no caller to propagate to; log instead of
            // leaving an unhandled rejection.
            console.error('Browser TTS: Failed to select voice after locale change:', error);
          }
        });
        this.localeListenerAttached = true;
      }

      // Listen for voices changed events (property is absent in browsers that
      // do not support the event)
      if (window.speechSynthesis.onvoiceschanged !== undefined) {
        window.speechSynthesis.onvoiceschanged = this.onVoicesChanged;
      }

      this.isReady = true;
      this.available = true;
      this.reportProgress(100, 'Browser TTS initialized');

      return true;
    } catch (error) {
      console.error('Browser TTS: Initialization error:', error);
      this.isReady = false;
      this.available = false;
      return false;
    }
  }

  /**
   * Handle voices changed event: reload the voice list and re-select a voice
   * for the current locale, honoring the stored preferred voice.
   * @returns {Promise<void>}
   */
  async onVoicesChanged() {
    try {
      await this.loadVoices();

      // Re-select voice based on current locale
      const localization = this.getModule('localization');
      const persistenceManager = this.getModule('persistence-manager');

      if (localization && persistenceManager) {
        const currentLocale = localization.getLocale() || 'en-us';
        const preferredVoice = persistenceManager.getPreference('tts', 'browser_voice', '');
        await this.selectVoiceForLocale(currentLocale, preferredVoice);
      }
    } catch (error) {
      // Runs as an event handler, so nothing upstream can catch a rejection.
      console.error('Browser TTS: Error handling voices changed event:', error);
    }
  }

  /**
   * Load available voices from the speech synthesis API. When the list is
   * empty on first read, waits up to three seconds for the browser to
   * announce voices. Falls back to a single placeholder entry if none appear.
   * @returns {Promise<boolean>} - Resolves with success status
   */
  async loadVoices() {
    try {
      this.reportProgress(40, 'Loading browser voices');

      const synth = window.speechSynthesis;

      // Try to get voices
      let voices = synth.getVoices();

      // If voices array is empty, wait for onvoiceschanged event
      if (!voices || voices.length === 0) {
        try {
          console.log('Browser TTS: No voices available immediately, waiting for voices to load...');
          voices = await waitForBrowserVoices(synth, 3000);
        } catch (voiceWaitError) {
          console.error('Browser TTS: Error waiting for voices:', voiceWaitError);
          // Continue with empty voices array
          voices = [];
        }
      }

      // Store voices
      this.voices = voices || [];

      // Log available voices for debugging
      console.log(`Browser TTS: Loaded ${this.voices.length} voices`);
      if (this.voices.length > 0) {
        console.log('Browser TTS: First few voices:', this.voices.slice(0, 3));
      }

      // If no voices available but speech synthesis is supported, still return true
      // Some browsers may not expose voices but still support speech synthesis
      if (this.voices.length === 0) {
        console.warn('Browser TTS: No voices available, but continuing with default voice');
        // Placeholder entry. It is NOT a SpeechSynthesisVoice, so it is flagged
        // and never assigned to utterance.voice (that assignment throws).
        this.voices = [{
          default: true,
          lang: 'en-US',
          localService: true,
          name: 'Default Voice',
          voiceURI: 'default',
          isPlaceholder: true
        }];
      }

      this.reportProgress(60, 'Browser voices loaded');
      return true;
    } catch (error) {
      console.error('Browser TTS: Error loading voices:', error);
      return false;
    }
  }

  /**
   * Set voice based on locale. Priority: preferred voice (by name or
   * voiceURI), exact locale match, same primary language, first voice.
   * @param {string} locale - Locale code (e.g., 'en-us', 'de', 'fr')
   * @param {string} preferredVoice - Optional preferred voice name
   * @returns {Promise<boolean>} - Success status (false when no voice exists)
   */
  async selectVoiceForLocale(locale = 'en-us', preferredVoice = '') {
    const voices = Array.isArray(this.voices) ? this.voices : [];

    // Default parameters do not cover an explicit null, so fall back here too.
    const wantedLocale = normalizeLocaleTag(locale || 'en-us');
    const wantedLanguage = primaryLanguage(wantedLocale);

    const preferred = preferredVoice
      ? voices.find(v => v.name === preferredVoice || v.voiceURI === preferredVoice)
      : undefined;

    // Voice tags are normalized like the requested locale so that 'en_US' and
    // 'en-us' compare equal. Comparing the primary subtag (not a prefix)
    // keeps 'fi' from matching 'fil-PH'.
    const chosen =
      preferred ||
      voices.find(v => normalizeLocaleTag(v.lang) === wantedLocale) ||
      voices.find(v => primaryLanguage(normalizeLocaleTag(v.lang)) === wantedLanguage) ||
      voices[0];

    if (!chosen) {
      // No voices available
      return false;
    }

    this.voiceOptions.voice = chosen;
    return true;
  }

  /**
   * Speak text, cancelling any speech already in progress.
   * @param {string} text - Text to speak
   * @param {Function} callback - Called with { success, reason?, error? } when
   *   speech ends, fails, or cannot start
   * @returns {boolean} - True when the utterance was queued
   */
  speak(text, callback = null) {
    const notify = (result) => {
      if (typeof callback === 'function') {
        callback(result);
      }
    };

    if (!this.isReady || !window.speechSynthesis) {
      notify({ success: false, reason: 'not_ready' });
      return false;
    }

    // Stop any ongoing speech
    this.stop();

    // Create utterance
    const utterance = new SpeechSynthesisUtterance(this.preprocessText(text));
    applyBrowserVoiceOptions(utterance, this.voiceOptions);

    // Events of a cancelled utterance arrive after its replacement has
    // started; only the current utterance may reset the speaking state.
    const release = () => {
      if (this.currentUtterance === utterance) {
        this.currentUtterance = null;
        this.isSpeaking = false;
      }
    };

    utterance.onend = () => {
      release();
      notify({ success: true });
    };

    utterance.onerror = (error) => {
      release();
      console.error('Browser TTS: Speech error', error);
      notify({ success: false, reason: 'synthesis_error', error });
    };

    // Store current utterance
    this.currentUtterance = utterance;
    this.isSpeaking = true;

    // Start speaking
    window.speechSynthesis.speak(utterance);

    return true;
  }

  /**
   * Preload speech for a text
   * @param {string} text - Text to preload
   * @returns {Promise<Object>} - { success, audioData, text, duration } on
   *   success, { success: false, reason } otherwise
   */
  async preloadSpeech(text) {
    if (!this.isReady || !window.speechSynthesis) {
      return { success: false, reason: 'not_ready' };
    }

    const wavResult = await this.synthesizeToWav(text);
    if (!wavResult.success) {
      return { success: false, reason: 'synthesis_failed' };
    }

    return {
      success: true,
      audioData: wavResult.audioData,
      text,
      duration: wavResult.duration || 0
    };
  }

  /**
   * Speak the text while recording a MediaStream destination, and return the
   * recorded bytes as an ArrayBuffer. Always resolves; never rejects.
   *
   * NOTE(review): nothing connects the speech output to the recorded
   * destination node, so the capture is probably silent, and MediaRecorder
   * output is normally not WAV despite the method name. speakPreloaded() does
   * not use the audio data. Confirm whether this path is still needed.
   *
   * @param {string} text - Text to synthesize
   * @returns {Promise<Object>} - { success: true, audioData } or
   *   { success: false, reason }
   */
  async synthesizeToWav(text) {
    if (!this.isReady || !window.speechSynthesis) {
      return { success: false, reason: 'not_ready' };
    }

    // Process text for better synthesis
    const processedText = this.preprocessText(text);

    const AudioContextClass = window.AudioContext || window.webkitAudioContext;
    if (!AudioContextClass) {
      return { success: false, reason: 'no_audio_context' };
    }

    if (typeof MediaRecorder === 'undefined') {
      return { success: false, reason: 'no_media_recorder' };
    }

    let audioContext = null;
    try {
      audioContext = new AudioContextClass();
      const destination = audioContext.createMediaStreamDestination();
      const mediaRecorder = new MediaRecorder(destination.stream);

      return await new Promise((resolve) => {
        const audioChunks = [];
        let failed = false;
        let timer = null;

        // Safe to call repeatedly: stop() throws on an inactive recorder.
        const stopRecording = () => {
          clearTimeout(timer);
          if (mediaRecorder.state !== 'inactive') {
            mediaRecorder.stop();
          }
        };

        mediaRecorder.ondataavailable = (event) => {
          if (event.data.size > 0) {
            audioChunks.push(event.data);
          }
        };

        mediaRecorder.onstop = () => {
          if (failed) {
            // The failure result was already delivered by utterance.onerror.
            return;
          }

          // Label the blob with the recorder's real container type.
          const audioBlob = new Blob(audioChunks, { type: mediaRecorder.mimeType || 'audio/wav' });

          // Convert blob to array buffer
          const reader = new FileReader();
          reader.onload = () => {
            resolve({ success: true, audioData: reader.result });
          };
          reader.onerror = () => {
            resolve({ success: false, reason: 'blob_read_error' });
          };
          reader.readAsArrayBuffer(audioBlob);
        };

        const utterance = new SpeechSynthesisUtterance(processedText);
        applyBrowserVoiceOptions(utterance, this.voiceOptions);

        utterance.onend = stopRecording;
        utterance.onerror = (error) => {
          console.error('Browser TTS: Synthesis error', error);
          failed = true;
          stopRecording();
          resolve({ success: false, reason: 'synthesis_error' });
        };

        // Start recording, then speaking
        mediaRecorder.start();
        window.speechSynthesis.speak(utterance);

        // Safety net in case onend never fires
        timer = setTimeout(stopRecording, 30000);
      });
    } catch (error) {
      console.error('Browser TTS: Synthesis error', error);
      return { success: false, reason: 'synthesis_error' };
    } finally {
      // Browsers cap the number of live AudioContexts; release this one.
      if (audioContext && typeof audioContext.close === 'function' && audioContext.state !== 'closed') {
        try {
          const closing = audioContext.close();
          if (closing && typeof closing.catch === 'function') {
            closing.catch((closeError) => {
              console.warn('Browser TTS: Failed to close audio context', closeError);
            });
          }
        } catch (closeError) {
          console.warn('Browser TTS: Failed to close audio context', closeError);
        }
      }
    }
  }

  /**
   * Speak preloaded audio data. The recorded audio is not used; the original
   * text is simply spoken again.
   * @param {Object} preloadedData - Data from preloadSpeech
   * @param {Function} callback - Callback for when speech completes
   * @returns {boolean} - Success status
   */
  speakPreloaded(preloadedData, callback = null) {
    if (!preloadedData || !preloadedData.text) {
      console.error('Browser TTS: Invalid preloaded data');
      // Mirror speak(): a caller waiting on the callback must hear about failure.
      if (typeof callback === 'function') {
        callback({ success: false, reason: 'invalid_data' });
      }
      return false;
    }

    return this.speak(preloadedData.text, callback);
  }

  /**
   * Preprocess text for TTS: strip HTML tags, decode a few common HTML
   * entities, speak a bare '&' as "and", and collapse whitespace.
   * @param {string} text - Text to preprocess (null/undefined yields '')
   * @returns {string} - Processed text
   */
  preprocessText(text) {
    if (text === null || text === undefined) {
      return '';
    }

    const entities = {
      amp: '&',
      nbsp: ' ',
      lt: '<',
      gt: '>',
      quot: '"',
      apos: "'",
      '#39': "'"
    };

    return String(text)
      // Remove HTML tags
      .replace(/<[^>]*>/g, ' ')
      // Decode entities before handling '&', otherwise '&amp;' would be
      // spoken as "and amp;"
      .replace(/&(amp|nbsp|lt|gt|quot|apos|#39);/gi, (match, name) => entities[name.toLowerCase()])
      // Replace special characters with their spoken equivalents
      .replace(/&/g, ' and ')
      // Normalize whitespace
      .replace(/\s+/g, ' ')
      .trim();
  }

  /**
   * Stop speaking
   * @returns {boolean} - Success status (false when speech synthesis is missing)
   */
  stop() {
    if (!window.speechSynthesis) {
      return false;
    }

    window.speechSynthesis.cancel();
    this.isSpeaking = false;
    this.currentUtterance = null;
    return true;
  }

  /**
   * Get available voices for the current locale's language; if none match,
   * all voices are returned.
   * @returns {Promise<Array>} - Array of { id, name, lang, gender } objects
   */
  async getVoices() {
    if (!this.isReady) {
      return [];
    }

    const localization = this.getModule('localization');
    // getLocale() may return nothing; use the same default as initialize().
    const currentLocale = (localization && localization.getLocale()) || 'en-us';
    const languageCode = primaryLanguage(normalizeLocaleTag(currentLocale));

    const voices = Array.isArray(this.voices) ? this.voices : [];
    const describe = (voice) => ({
      id: voice.voiceURI,
      name: voice.name,
      lang: voice.lang,
      gender: this.inferVoiceGender(voice.name)
    });

    // Filter voices by the current locale's primary language
    const matching = voices.filter(
      voice => primaryLanguage(normalizeLocaleTag(voice.lang)) === languageCode
    );

    return (matching.length > 0 ? matching : voices).map(describe);
  }

  /**
   * Infer voice gender from name. Whole words only, so "Female" is not read
   * as "male" and "German" is not read as "man".
   * @param {string} name - Voice name
   * @returns {string} - Inferred gender ('male', 'female', or 'unknown')
   */
  inferVoiceGender(name) {
    const maleTerms = ['male', 'man', 'guy', 'boy', 'mr', 'sir'];
    const femaleTerms = ['female', 'woman', 'lady', 'girl', 'ms', 'mrs', 'miss'];

    const words = String(name || '')
      .toLowerCase()
      .split(/[^a-z]+/)
      .filter(Boolean);

    if (words.some(word => maleTerms.includes(word))) {
      return 'male';
    }
    if (words.some(word => femaleTerms.includes(word))) {
      return 'female';
    }
    return 'unknown';
  }
}

/**
 * Lower-case a locale tag and use '-' as the only separator.
 * @param {string} tag - Locale tag such as 'en_US' (missing values yield '')
 * @returns {string} - Normalized tag such as 'en-us'
 */
function normalizeLocaleTag(tag) {
  return String(tag || '').toLowerCase().replace(/_/g, '-');
}

/**
 * Return the primary language subtag of a normalized locale tag.
 * @param {string} normalizedTag - Tag produced by normalizeLocaleTag
 * @returns {string} - e.g. 'en' for 'en-us'
 */
function primaryLanguage(normalizedTag) {
  return normalizedTag.split('-')[0];
}

/**
 * Copy voice, rate, pitch and volume onto an utterance. The placeholder voice
 * only contributes its language, since it cannot be assigned as a voice.
 * @param {SpeechSynthesisUtterance} utterance - Utterance to configure
 * @param {Object} options - Object with voice, rate, pitch and volume
 */
function applyBrowserVoiceOptions(utterance, options) {
  const voice = options.voice;
  if (voice && voice.isPlaceholder) {
    utterance.lang = voice.lang;
  } else if (voice) {
    utterance.voice = voice;
  }

  utterance.rate = options.rate;
  utterance.pitch = options.pitch;
  utterance.volume = options.volume;
}

/**
 * Wait for the next voiceschanged notification, or resolve with an empty list
 * after the timeout. The previous onvoiceschanged handler is restored
 * afterwards so a permanent handler is not lost.
 * @param {SpeechSynthesis} synth - The speech synthesis object
 * @param {number} timeoutMs - Maximum wait in milliseconds
 * @returns {Promise<Array>} - Loaded voices, or [] on timeout
 */
function waitForBrowserVoices(synth, timeoutMs) {
  return new Promise((resolve) => {
    const previousHandler = synth.onvoiceschanged;
    let settled = false;
    let timer = null;

    const finish = (voices) => {
      if (settled) {
        return;
      }
      settled = true;
      clearTimeout(timer);
      synth.onvoiceschanged = previousHandler;
      resolve(voices);
    };

    timer = setTimeout(() => {
      console.warn('Browser TTS: Timeout waiting for voices');
      // Resolve with empty array instead of rejecting
      finish([]);
    }, timeoutMs);

    synth.onvoiceschanged = () => {
      const loadedVoices = synth.getVoices();
      console.log(`Browser TTS: Voices loaded, found ${loadedVoices.length} voices`);
      finish(loadedVoices);
    };
  });
}
|
||||
|
||||
// Register the module with the module registry
|
||||
// Module registry MUST be accessed via window, not direct import
|
||||
/**
 * Create a BrowserTTSModule instance and register it with the given registry.
 * @param {Object} registry - Module registry exposing register(module); may be
 *   null/undefined when the registry has not been set up
 * @returns {boolean} - True when the module was registered
 */
function registerBrowserTTSModule(registry) {
  if (!registry) {
    console.error('Module registry not available when attempting to register Browser TTS Module');
    return false;
  }

  try {
    // Create instance first, then register it
    const browserTTSModule = new BrowserTTSModule();
    registry.register(browserTTSModule);
    console.log('Browser TTS Module registered successfully');
    return true;
  } catch (err) {
    console.error('Failed to register Browser TTS Module:', err);
    return false;
  }
}

// The registry is read from window. The typeof guard keeps a non-browser load
// (tests, tooling) from throwing a ReferenceError at import time.
registerBrowserTTSModule(typeof window !== 'undefined' ? window.moduleRegistry : undefined);
|
||||
Reference in New Issue
Block a user