/**
 * Kokoro Text-to-Speech Handler for AI Interactive Fiction
 * Uses the kokoro-js library for high-quality TTS
 *
 * Lifecycle: the constructor starts loading the model (WebGPU only); `toggle()`
 * must be called from a user gesture to create the AudioContext and enable
 * speech; `speak()` then splits text into chunks, generates audio per chunk and
 * plays the chunks back-to-back through the Web Audio API.
 */
class KokoroHandler {
  /**
   * Set up default state and kick off model initialization in the background.
   */
  constructor() {
    this.enabled = false;
    this.speaking = false;
    this.paused = false;
    this.audio = null;
    this.currentSpeed = 1.0; // Forwarded to generate() as the `speed` option
    this.audioQueue = [];
    this.isProcessingQueue = false;
    this.kokoroReady = false;
    this.kokoroInstance = null; // Store the KokoroTTS instance
    this.hasUserActivation = false;
    this.initializationPromise = null;
    this.audioContext = null; // For playing the generated audio
    this.currentVoice = "af_heart"; // Default voice from README
    this.currentAudioSource = null; // To keep track of the playing audio source
    // Incremented by stop(); in-flight generation/playback compares against it
    // to detect that it has been superseded and must not play or touch flags.
    this.playbackGeneration = 0;

    // Start initialization process (the returned promise never rejects)
    this.initializeKokoro();
  }

  /**
   * Initialize Kokoro TTS by waiting for the class and then instantiating.
   * Repeated calls share the same underlying initialization attempt.
   * @returns {Promise<boolean>} Resolves true when the instance is ready, false otherwise
   */
  async initializeKokoro() {
    if (!this.initializationPromise) {
      this.initializationPromise = this._runInitialization();
    }
    return this.initializationPromise;
  }

  /**
   * Perform the one-time initialization: wait for `window.KokoroTTS` if needed,
   * then create the instance. Never rejects; failures resolve to false.
   * @returns {Promise<boolean>}
   * @private
   */
  async _runInitialization() {
    try {
      if (typeof window.KokoroTTS === 'undefined') {
        console.log('Kokoro TTS class not found, waiting for it to load...');
        const classLoaded = await this._waitForKokoroClass(15000);
        if (!classLoaded) {
          return false;
        }
      } else {
        console.log('KokoroTTS class found directly.');
      }
      return await this._initKokoroInstance();
    } catch (error) {
      console.error('Error during KokoroHandler initialization:', error);
      return false;
    }
  }

  /**
   * Wait for the 'kokoro-class-loaded' / 'kokoro-class-load-failed' window events.
   * Always settles and always removes its listeners and timer.
   * @param {number} timeoutMs - Maximum time to wait for either event
   * @returns {Promise<boolean>} true if the class is available, false on failure/timeout
   * @private
   */
  _waitForKokoroClass(timeoutMs) {
    return new Promise((resolve) => {
      let timeoutId = null;

      const settle = (result) => {
        clearTimeout(timeoutId);
        window.removeEventListener('kokoro-class-loaded', onLoaded);
        window.removeEventListener('kokoro-class-load-failed', onFailed);
        resolve(result);
      };
      const onLoaded = () => {
        console.log('KokoroTTS class loaded event received.');
        settle(true);
      };
      const onFailed = () => {
        console.error('KokoroTTS class failed to load.');
        settle(false);
      };

      window.addEventListener('kokoro-class-loaded', onLoaded);
      window.addEventListener('kokoro-class-load-failed', onFailed);

      // Timeout if neither event ever fires
      timeoutId = setTimeout(() => {
        // The class may have appeared without the event being dispatched
        if (typeof window.KokoroTTS !== 'undefined') {
          settle(true);
          return;
        }
        console.error('Timed out waiting for KokoroTTS class load event.');
        settle(false);
      }, timeoutMs);
    });
  }

  /**
   * Internal method to create and initialize the KokoroTTS instance.
   * Only WebGPU is attempted; any other device is treated as failure.
   * @returns {Promise<boolean>} true when the instance is ready
   * @private
   */
  async _initKokoroInstance() {
    if (this.kokoroInstance || this.kokoroReady) return true; // Already initialized

    let pretrainedTimeoutId = null;
    try {
      console.log('Initializing KokoroTTS instance (GPU Only Attempt)...');
      const model_id = "onnx-community/Kokoro-82M-v1.0-ONNX";

      // --- Check for WebGPU Support ---
      const device = await this.getBestDevice();
      if (device !== 'webgpu') {
        console.warn('WebGPU not available or supported. Kokoro TTS (GPU) cannot be initialized.');
        // Explicitly set ready to false and return false to signal failure
        this.kokoroReady = false;
        return false;
      }

      // Use fp32 for WebGPU as recommended
      const dtype = 'fp32';
      console.log(`Attempting KokoroTTS init with device: ${device}, dtype: ${dtype}`);
      console.log(`Calling KokoroTTS.from_pretrained('${model_id}', { dtype: '${dtype}', device: '${device}' })...`);

      // --- Timeout wrapper for from_pretrained ---
      const fromPretrainedPromise = window.KokoroTTS.from_pretrained(model_id, {
        dtype,
        device, // Always 'webgpu' if we reach here
      });
      const pretrainedTimeoutPromise = new Promise((_, reject) => {
        pretrainedTimeoutId = setTimeout(
          () => reject(new Error('KokoroTTS.from_pretrained (WebGPU) timed out after 55 seconds')),
          55000
        );
      });

      try {
        this.kokoroInstance = await Promise.race([
          fromPretrainedPromise,
          pretrainedTimeoutPromise,
        ]);
      } catch (timeoutError) {
        console.error(timeoutError.message); // Log the specific timeout/load error
        throw timeoutError; // Re-throw to be handled by the outer catch block
      }

      console.log('KokoroTTS.from_pretrained call completed.');

      if (!this.kokoroInstance) {
        console.error('KokoroTTS.from_pretrained returned a falsy value.');
        throw new Error('KokoroTTS.from_pretrained returned null or undefined.');
      }

      // AudioContext creation is deferred until toggle() (needs a user gesture)
      this.kokoroReady = true;
      console.log('Kokoro TTS (WebGPU) instance created successfully (AudioContext deferred).');
      return true;
    } catch (error) {
      console.error('Error during KokoroTTS (WebGPU) initialization:', error);
      // Guard the property reads: anything can be thrown, including undefined
      if (error && error.message) {
        console.error('Error message:', error.message);
      }
      if (error && error.stack) {
        console.error('Error stack:', error.stack);
      }
      this.kokoroInstance = null;
      this.kokoroReady = false;
      return false; // Ensure failure is explicitly returned
    } finally {
      // Don't leave the 55s timer pending once the race has settled
      if (pretrainedTimeoutId !== null) {
        clearTimeout(pretrainedTimeoutId);
      }
    }
  }

  /**
   * Determine the best device (webgpu or wasm) by requesting a WebGPU adapter.
   * @returns {Promise<string>} 'webgpu' when an adapter is obtained, otherwise 'wasm'
   * @private
   */
  async getBestDevice() {
    if (typeof navigator !== 'undefined' && navigator.gpu) {
      try {
        // Request an adapter. If this succeeds, WebGPU is likely available.
        const adapter = await navigator.gpu.requestAdapter();
        if (adapter) {
          console.log('WebGPU supported, selecting webgpu device.');
          return 'webgpu';
        }
        console.warn('WebGPU adapter request returned null.');
      } catch (e) {
        console.warn('WebGPU adapter request failed:', e);
      }
    }
    console.log('WebGPU not supported or available, cannot use GPU for Kokoro.');
    return 'wasm'; // Indicates GPU is not available; callers treat this as failure
  }

  /**
   * List available voices (delegates to the KokoroTTS instance).
   *
   * If `list_voices()` returns a value it is passed through unchanged. If it
   * returns nothing, the instance's `voices` property is used instead
   * (NOTE(review): kokoro-js appears to expose a map keyed by voice id there -
   * confirm against the library version in use). The final fallback is the
   * single default voice.
   * @returns {Promise<Array>} Voice descriptors; empty when Kokoro is not ready or on error
   */
  async listVoices() {
    if (!this.kokoroReady || !this.kokoroInstance) {
      console.warn('Kokoro not ready, cannot list voices.');
      return [];
    }

    const defaultVoices = [{ name: 'af_heart', description: 'Default American Female' }];

    try {
      const instance = this.kokoroInstance;
      if (typeof instance.list_voices !== 'function') {
        console.warn('list_voices method not found on KokoroTTS instance. Returning default.');
        return defaultVoices;
      }

      const listed = await instance.list_voices();
      if (listed !== undefined && listed !== null) {
        return listed;
      }

      // list_voices() produced no value; fall back to the `voices` property.
      const voices = instance.voices;
      if (Array.isArray(voices)) {
        return voices;
      }
      if (voices && typeof voices === 'object') {
        const described = Object.keys(voices).map((id) => {
          const meta = voices[id];
          const parts = meta && typeof meta === 'object'
            ? [meta.name, meta.language, meta.gender].filter(Boolean)
            : [];
          return { name: id, description: parts.join(', ') };
        });
        if (described.length > 0) {
          return described;
        }
      }
      return defaultVoices;
    } catch (error) {
      console.error('Error listing Kokoro voices:', error);
      return [];
    }
  }

  /**
   * Set the voice to use
   * @param {string} voiceName - Name of the voice (e.g., 'af_heart')
   */
  setVoice(voiceName) {
    this.currentVoice = voiceName;
    console.log(`Kokoro voice set to: ${voiceName}`);
  }

  /**
   * Toggle TTS functionality on/off. Must be called from a user gesture the
   * first time, because it creates the AudioContext.
   * @returns {boolean} New state of TTS (enabled/disabled)
   */
  toggle() {
    // Set user activation flag when toggle is called
    this.hasUserActivation = true;

    // --- Create AudioContext on first activation ---
    if (!this.audioContext) {
      try {
        this.audioContext = new (window.AudioContext || window.webkitAudioContext)();
        console.log('AudioContext created on user activation.');
        // Resume if context starts suspended
        if (this.audioContext.state === 'suspended') {
          this.audioContext.resume().catch(err => console.error('Error resuming initial AudioContext:', err));
        }
      } catch (e) {
        console.error('Failed to create AudioContext:', e);
        // If AudioContext fails, Kokoro cannot play audio
        this.kokoroReady = false;
        return false;
      }
    }

    if (!this.kokoroReady) {
      console.warn('Kokoro TTS not ready yet');
      return false;
    }

    this.enabled = !this.enabled;
    console.log("Kokoro TTS toggled:", this.enabled ? "ON" : "OFF");

    // Stop any ongoing speech when disabling
    if (!this.enabled && (this.speaking || this.isProcessingQueue)) {
      this.stop();
    }

    return this.enabled;
  }

  /**
   * Set the speech rate/speed. Values are clamped to the range 0.5 - 2.0;
   * non-numeric input is ignored so the previous speed is kept.
   * @param {number} speed - Speed multiplier (0.5 to 2.0)
   */
  setSpeed(speed) {
    const requested = Number(speed);
    if (!Number.isFinite(requested)) {
      console.warn('Ignoring invalid Kokoro speed value:', speed);
      return;
    }
    this.currentSpeed = Math.max(0.5, Math.min(2.0, requested));
  }

  /**
   * Process text for better speech synthesis by stripping markdown emphasis,
   * markdown links (keeping the link text) and HTML tags.
   * @param {string} text - Text to process
   * @returns {string} - Processed text ("" for empty input)
   */
  processTextForSpeech(text) {
    if (!text) return "";

    return String(text)
      .replace(/\*\*([^*]+)\*\*/g, '$1') // Bold
      .replace(/\*([^*]+)\*/g, '$1') // Italic
      .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1') // Links
      .replace(/<[^>]+>/g, ''); // HTML tags
  }

  /**
   * Split text into digestible chunks for better TTS handling. Sentences are
   * kept whole and grouped until adding another would exceed the limit; a
   * single sentence longer than the limit becomes its own chunk.
   * @param {string} text - Text to split
   * @param {number} [maxChunkLength=500] - Soft upper bound on chunk length
   * @returns {string[]} - Array of text chunks (no whitespace-only chunks)
   */
  splitTextIntoChunks(text, maxChunkLength = 500) {
    if (!text) return [];

    // A sentence is a run of non-terminators followed by its terminators, OR
    // by end-of-text so that a trailing fragment without punctuation is kept.
    const sentenceRegex = /[^.!?]+(?:[.!?]+|$)/g;
    const sentences = text.match(sentenceRegex) || [text];

    const chunks = [];
    let currentChunk = '';

    for (const sentence of sentences) {
      // Nothing to speak in pure whitespace
      if (sentence.trim() === '') continue;

      // If adding this sentence would make the chunk too long, start a new chunk
      if (currentChunk.length + sentence.length > maxChunkLength) {
        if (currentChunk) chunks.push(currentChunk);
        currentChunk = sentence;
      } else {
        currentChunk += sentence;
      }
    }

    if (currentChunk) chunks.push(currentChunk);

    return chunks;
  }

  /**
   * Process the speech queue using KokoroTTS: generate and play each queued
   * chunk in order. Only one loop runs at a time; a loop that was superseded
   * by stop() discards its in-flight result and leaves the shared flags alone.
   * @returns {Promise<void>}
   * @private
   */
  async processQueue() {
    // Ensure AudioContext is available before processing
    if (!this.audioContext) {
      console.warn('AudioContext not available, cannot process Kokoro queue.');
      this.isProcessingQueue = false;
      this.speaking = false;
      return;
    }

    if (this.isProcessingQueue) return;

    if (this.audioQueue.length === 0) {
      this.speaking = false; // Ensure speaking flag is reset when queue is empty
      return;
    }

    if (!this.kokoroReady || !this.kokoroInstance) return;

    const generation = this.playbackGeneration || 0;
    const isCurrent = () => (this.playbackGeneration || 0) === generation;

    // Claim the queue synchronously, before the first await, so a concurrent
    // call cannot start a second loop.
    this.isProcessingQueue = true;
    this.speaking = true;

    try {
      // Ensure AudioContext is running - unless the user deliberately paused
      if (this.audioContext.state === 'suspended' && !this.paused) {
        await this.audioContext.resume().catch(err => console.error('Error resuming AudioContext for queue:', err));
      }

      while (isCurrent() && this.audioQueue.length > 0 && this.kokoroReady && this.kokoroInstance) {
        const textChunk = this.audioQueue.shift();
        if (!textChunk) continue;

        console.log(`Kokoro generating chunk (${this.audioQueue.length} remaining):`, `${textChunk.substring(0, 30)}...`);

        try {
          // Use Kokoro instance to generate audio
          const audioResult = await this.kokoroInstance.generate(textChunk, {
            voice: this.currentVoice,
            speed: this.currentSpeed,
          });

          // stop() (or a newer speak()) happened while generating: drop the result
          if (!isCurrent()) break;

          // Expect raw Float32Array samples plus a sampling rate
          if (!audioResult || !audioResult.audio || !(audioResult.audio instanceof Float32Array) || !audioResult.sampling_rate) {
            console.error('Invalid audio data or sampling rate received from KokoroTTS.generate', audioResult);
            throw new Error('Invalid audio data or sampling rate received from KokoroTTS.generate');
          }

          const rawAudioSamples = audioResult.audio;
          const samplingRate = audioResult.sampling_rate;
          console.log(`Received raw audio samples (${rawAudioSamples.length}), sample rate: ${samplingRate}`);

          // Play the raw audio samples; resolves when playback of this chunk ends
          await this.playRawAudio(rawAudioSamples, samplingRate);
        } catch (error) {
          // A failed chunk is skipped; the rest of the queue still plays
          console.error("Error generating or playing Kokoro speech:", error);
        }
      }
    } catch (error) {
      console.error("Error in Kokoro processQueue:", error);
    } finally {
      // Only the loop that still owns the queue may reset the shared flags
      if (isCurrent()) {
        this.isProcessingQueue = false;
        this.speaking = false;
        if (this.audioQueue.length === 0) {
          console.log("Kokoro queue finished.");
        }
      }
    }
  }

  /**
   * Play raw Float32Array audio samples using Web Audio API
   * @param {Float32Array} samples - The raw audio samples (treated as mono)
   * @param {number} sampleRate - The sample rate of the audio
   * @returns {Promise<void>} Resolves when playback ends (or immediately on error)
   * @private
   */
  async playRawAudio(samples, sampleRate) {
    if (!this.audioContext) {
      console.error('AudioContext not initialized.');
      return;
    }

    const generation = this.playbackGeneration || 0;

    // Resume a suspended context, but never undo an explicit pause()
    if (this.audioContext.state === 'suspended' && !this.paused) {
      await this.audioContext.resume().catch(err => console.error('Error resuming AudioContext for playback:', err));
      // stop() may have run while we were waiting for the context to resume
      if ((this.playbackGeneration || 0) !== generation) return;
    }

    try {
      // Create an AudioBuffer (1 channel: mono is assumed)
      const audioBuffer = this.audioContext.createBuffer(1, samples.length, sampleRate);

      // NOTE: If audio is stereo, this needs adjustment
      audioBuffer.copyToChannel(samples, 0);

      // Create a source node
      const source = this.audioContext.createBufferSource();
      source.buffer = audioBuffer;
      source.connect(this.audioContext.destination);

      // Store the current source to allow stopping
      this.currentAudioSource = source;

      console.log(`Playing audio buffer (${(samples.length / sampleRate).toFixed(2)}s)`);

      // Awaited inside the try so a failure in start() is handled here and
      // currentAudioSource is cleared.
      await new Promise((resolve) => {
        source.onended = () => {
          // Only clear the slot if it still refers to this source
          if (this.currentAudioSource === source) {
            this.currentAudioSource = null;
          }
          console.log('Audio playback finished.');
          resolve();
        };
        source.start(0); // Start playback immediately
      });
    } catch (error) {
      console.error('Error creating or playing raw audio buffer:', error);
      this.currentAudioSource = null; // Clear source on error
    }
  }

  /**
   * Speak the provided text using KokoroTTS
   * @param {string} text - Text to be spoken
   * @param {function} onEndCallback - Callback when all speech ends (also
   *   invoked when nothing can be spoken)
   */
  speak(text, onEndCallback = null) {
    if (!this.enabled || !text) {
      if (onEndCallback) onEndCallback();
      return;
    }

    // If kokoro is not ready yet, wait for initialization
    if (!this.kokoroReady) {
      console.warn("Kokoro TTS not ready yet, waiting for initialization...");
      // initializeKokoro() returns the shared, never-rejecting init promise
      this.initializeKokoro().then(success => {
        if (success && this.enabled) {
          this._speakInternal(text, onEndCallback);
          return;
        }
        if (!success) {
          console.error("Kokoro failed to initialize, cannot speak.");
        }
        if (onEndCallback) onEndCallback();
      });
      return;
    }

    this._speakInternal(text, onEndCallback);
  }

  /**
   * Internal method to handle speech after initialization checks: replaces any
   * current speech with the new text and starts queue processing.
   * @param {string} text - Text to be spoken
   * @param {function} onEndCallback - Invoked once nothing is speaking any more
   * @private
   */
  _speakInternal(text, onEndCallback) {
    // Ensure AudioContext is resumed after user interaction
    if (this.audioContext && this.audioContext.state === 'suspended') {
      this.audioContext.resume().catch(err => console.error('Error resuming AudioContext:', err));
    }

    // Don't attempt to speak without user activation
    if (!this.hasUserActivation) {
      console.warn("Not attempting to speak because there hasn't been user interaction yet");
      if (onEndCallback) onEndCallback();
      return;
    }

    try {
      const processedText = this.processTextForSpeech(text);
      console.log("Kokoro TTS attempting to speak:", `${processedText.substring(0, 50)}...`);

      // Stop any existing speech (also invalidates in-flight generation)
      this.stop();

      // Split into manageable chunks
      this.audioQueue = this.splitTextIntoChunks(processedText);

      // Start processing the queue
      if (this.audioQueue.length > 0 && !this.isProcessingQueue) {
        this.processQueue();
      }

      // Poll for completion; there is no single "all chunks done" event
      if (onEndCallback) {
        const checkCompletion = () => {
          if (!this.isSpeaking()) {
            onEndCallback();
          } else {
            setTimeout(checkCompletion, 150); // Check again shortly
          }
        };
        // Start checking slightly after processing begins
        setTimeout(checkCompletion, 100);
      }
    } catch (error) {
      console.error("Error in Kokoro speak:", error);
      if (onEndCallback) onEndCallback();
    }
  }

  /**
   * Pause the current speech by suspending the AudioContext.
   * (Note: May not be perfectly resumable with AudioBufferSourceNode)
   */
  pause() {
    if (!this.speaking || !this.audioContext) return;

    this.audioContext.suspend().catch(err => console.error('Error suspending AudioContext:', err));
    this.paused = true;
    console.log('Kokoro audio paused (via AudioContext suspend)');
  }

  /**
   * Resume paused speech
   */
  resume() {
    if (!this.paused || !this.audioContext) return;

    this.audioContext.resume().catch(err => console.error('Error resuming AudioContext:', err));
    this.paused = false;
    console.log('Kokoro audio resumed (via AudioContext resume)');
  }

  /**
   * Stop the current speech, clear the queue and invalidate any generation or
   * playback that is still in flight.
   */
  stop() {
    // Invalidate in-flight work so a pending generate() result is never played
    this.playbackGeneration = (this.playbackGeneration || 0) + 1;

    // Stop any currently playing audio source
    if (this.currentAudioSource) {
      try {
        this.currentAudioSource.stop();
      } catch (e) {
        // Ignore errors if source already stopped
      }
      this.currentAudioSource = null;
    }

    // A pause leaves the context suspended; undo that so state stays consistent
    if (this.paused && this.audioContext && this.audioContext.state === 'suspended') {
      this.audioContext.resume().catch(err => console.error('Error resuming AudioContext:', err));
    }

    // Clear the queue and reset flags
    this.audioQueue = [];
    this.isProcessingQueue = false;
    this.speaking = false;
    this.paused = false;
    console.log('Kokoro speech stopped and queue cleared.');
  }

  /**
   * Check if TTS is currently active/enabled
   * @returns {boolean} true when enabled and the Kokoro instance is ready
   */
  isEnabled() {
    return this.enabled && this.kokoroReady;
  }

  /**
   * Check if speech is currently in progress
   * @returns {boolean} true while generating, queued for playback, or playing
   */
  isSpeaking() {
    // Consider both the processing flag and if an audio source is active
    return this.speaking || this.isProcessingQueue || !!this.currentAudioSource;
  }
}

// Don't create a global instance here - the factory will do this
// const ttsHandler = new KokoroHandler();