feat: Integrate Kokoro TTS with WebGPU and fallback

This commit is contained in:
2025-04-01 10:34:24 +00:00
parent 113e3b995d
commit 1882acac8c
111 changed files with 9143 additions and 4447 deletions
+597
View File
@@ -0,0 +1,597 @@
/**
 * Kokoro Text-to-Speech handler for AI Interactive Fiction.
 *
 * Wraps a `window.KokoroTTS` instance (kokoro-js): text is cleaned up, split
 * into chunks, synthesised chunk by chunk and played through the Web Audio
 * API. Only the WebGPU backend is initialised here; when WebGPU is not
 * available initialisation resolves to `false` so a caller can fall back to
 * another TTS handler.
 */
class KokoroHandler {
  constructor() {
    this.enabled = false;
    this.speaking = false;
    this.paused = false;
    this.audio = null;
    this.currentSpeed = 1.0; // Forwarded to KokoroTTS.generate() as `speed`
    this.audioQueue = [];
    this.isProcessingQueue = false;
    this.kokoroReady = false;
    this.kokoroInstance = null; // The KokoroTTS instance once loaded
    this.hasUserActivation = false;
    this.initializationPromise = null;
    this.audioContext = null; // Created lazily on first user activation
    this.currentVoice = "af_heart"; // Default voice
    this.currentAudioSource = null; // Source node that is currently playing
    // Bumped by stop(); lets an in-flight generate()/playback notice that it
    // has been superseded and must not touch shared state any more.
    this.playbackToken = 0;
    // Start initialization process
    this.initializeKokoro();
  }

  /**
   * Initialize Kokoro TTS (once). Repeated calls return the same promise.
   * @returns {Promise<boolean>} Resolves to true when the instance is ready,
   *   false on any failure. Never rejects.
   */
  initializeKokoro() {
    if (!this.initializationPromise) {
      this.initializationPromise = this._runInitialization();
    }
    return this.initializationPromise;
  }

  /**
   * Wait for the KokoroTTS class if necessary, then create the instance.
   * @returns {Promise<boolean>}
   * @private
   */
  async _runInitialization() {
    try {
      if (typeof window.KokoroTTS === 'undefined') {
        console.log('Kokoro TTS class not found, waiting for it to load...');
        const classLoaded = await this._waitForKokoroClass(15000);
        if (!classLoaded) {
          return false;
        }
      } else {
        console.log('KokoroTTS class found directly.');
      }
      return await this._initKokoroInstance();
    } catch (error) {
      console.error('Error during KokoroHandler initialization:', error);
      return false;
    }
  }

  /**
   * Wait for the 'kokoro-class-loaded' / 'kokoro-class-load-failed' window
   * events, giving up after `timeoutMs`.
   * @param {number} timeoutMs - Maximum time to wait in milliseconds
   * @returns {Promise<boolean>} true if the class loaded, false otherwise
   * @private
   */
  _waitForKokoroClass(timeoutMs) {
    return new Promise((resolve) => {
      let timeoutId = null;
      // Whichever outcome happens first removes the other two triggers.
      const cleanup = () => {
        clearTimeout(timeoutId);
        window.removeEventListener('kokoro-class-loaded', onLoaded);
        window.removeEventListener('kokoro-class-load-failed', onFailed);
      };
      const onLoaded = () => {
        cleanup();
        console.log('KokoroTTS class loaded event received.');
        resolve(true);
      };
      const onFailed = () => {
        cleanup();
        console.error('KokoroTTS class failed to load.');
        resolve(false);
      };
      window.addEventListener('kokoro-class-loaded', onLoaded);
      window.addEventListener('kokoro-class-load-failed', onFailed);
      timeoutId = setTimeout(() => {
        cleanup();
        console.error('Timed out waiting for KokoroTTS class load event.');
        resolve(false);
      }, timeoutMs);
    });
  }

  /**
   * Internal method to create and initialize the KokoroTTS instance.
   * @returns {Promise<boolean>} true on success (or if already initialized)
   * @private
   */
  async _initKokoroInstance() {
    if (this.kokoroInstance || this.kokoroReady) return true; // Already initialized
    let timeoutId = null;
    try {
      console.log('Initializing KokoroTTS instance (GPU Only Attempt)...');
      const model_id = "onnx-community/Kokoro-82M-v1.0-ONNX";
      const device = await this.getBestDevice();
      if (device !== 'webgpu') {
        console.warn('WebGPU not available or supported. Kokoro TTS (GPU) cannot be initialized.');
        // Signal failure explicitly so the caller can fall back
        this.kokoroReady = false;
        return false;
      }
      const dtype = 'fp32'; // dtype chosen for the WebGPU backend
      console.log(`Attempting KokoroTTS init with device: ${device}, dtype: ${dtype}`);
      console.log(`Calling KokoroTTS.from_pretrained('${model_id}', { dtype: '${dtype}', device: '${device}' })...`);
      const fromPretrainedPromise = window.KokoroTTS.from_pretrained(model_id, {
        dtype: dtype,
        device: device, // Always 'webgpu' if we reach here
      });
      // Model download/compile can hang; race it against a 55 second timer.
      const pretrainedTimeoutPromise = new Promise((_, reject) => {
        timeoutId = setTimeout(
          () => reject(new Error('KokoroTTS.from_pretrained (WebGPU) timed out after 55 seconds')),
          55000
        );
      });
      this.kokoroInstance = await Promise.race([
        fromPretrainedPromise,
        pretrainedTimeoutPromise
      ]);
      console.log('KokoroTTS.from_pretrained call completed.');
      if (!this.kokoroInstance) {
        console.error('KokoroTTS.from_pretrained returned a falsy value.');
        throw new Error('KokoroTTS.from_pretrained returned null or undefined.');
      }
      // AudioContext creation is deferred until the first user activation
      this.kokoroReady = true;
      console.log('Kokoro TTS (WebGPU) instance created successfully (AudioContext deferred).');
      return true;
    } catch (error) {
      console.error('Error during KokoroTTS (WebGPU) initialization:', error);
      if (error && error.message) {
        console.error('Error message:', error.message);
      }
      if (error && error.stack) {
        console.error('Error stack:', error.stack);
      }
      this.kokoroInstance = null;
      this.kokoroReady = false;
      return false;
    } finally {
      // Do not leave the timeout timer pending once the race has settled.
      clearTimeout(timeoutId);
    }
  }

  /**
   * Determine the best device by probing for WebGPU support.
   * @returns {Promise<string>} 'webgpu' when an adapter can be obtained,
   *   otherwise 'wasm'
   * @private
   */
  async getBestDevice() {
    if (typeof navigator !== 'undefined' && navigator.gpu) {
      try {
        // A non-null adapter means WebGPU is likely usable.
        const adapter = await navigator.gpu.requestAdapter();
        if (adapter) {
          console.log('WebGPU supported, selecting webgpu device.');
          return 'webgpu';
        }
        console.warn('WebGPU adapter request returned null.');
      } catch (e) {
        console.warn('WebGPU adapter request failed:', e);
      }
    }
    console.log('WebGPU not supported or available, cannot use GPU for Kokoro.');
    return 'wasm';
  }

  /**
   * List available voices.
   *
   * Always resolves to an array. If `list_voices()` yields nothing the
   * instance's `voices` property is consulted instead (assumption: some
   * kokoro-js versions only print a table from `list_voices()` and expose the
   * data via a `voices` getter - TODO confirm for the bundled version). An
   * object keyed by voice id is converted to `{ name, description }` entries.
   * @returns {Promise<Array>} Voice descriptors, or [] when not ready / on error
   */
  async listVoices() {
    if (!this.kokoroReady || !this.kokoroInstance) {
      console.warn('Kokoro not ready, cannot list voices.');
      return [];
    }
    const instance = this.kokoroInstance;
    try {
      const hasListMethod = typeof instance.list_voices === 'function';
      let voices = hasListMethod ? await instance.list_voices() : undefined;
      if (voices === undefined || voices === null) {
        voices = instance.voices;
      }
      if (Array.isArray(voices)) {
        return voices;
      }
      if (voices && typeof voices === 'object') {
        return Object.keys(voices).map((id) => {
          const info = voices[id];
          const label = info && typeof info.name === 'string' ? info.name : id;
          return { name: id, description: label };
        });
      }
      if (!hasListMethod) {
        console.warn('list_voices method not found on KokoroTTS instance. Returning default.');
      }
      return [{ name: 'af_heart', description: 'Default American Female' }];
    } catch (error) {
      console.error('Error listing Kokoro voices:', error);
      return [];
    }
  }

  /**
   * Set the voice to use
   * @param {string} voiceName - Name of the voice (e.g., 'af_heart')
   */
  setVoice(voiceName) {
    this.currentVoice = voiceName;
    console.log(`Kokoro voice set to: ${voiceName}`);
  }

  /**
   * Toggle TTS functionality on/off. Must be called from a user gesture: it
   * records user activation and creates the AudioContext on first use.
   * @returns {boolean} New state of TTS (enabled/disabled); false when the
   *   AudioContext cannot be created or Kokoro is not ready
   */
  toggle() {
    this.hasUserActivation = true;
    if (!this.audioContext) {
      try {
        this.audioContext = new (window.AudioContext || window.webkitAudioContext)();
        console.log('AudioContext created on user activation.');
        // Resume if context starts suspended
        if (this.audioContext.state === 'suspended') {
          this.audioContext.resume().catch(err => console.error('Error resuming initial AudioContext:', err));
        }
      } catch (e) {
        console.error('Failed to create AudioContext:', e);
        // Without an AudioContext Kokoro cannot play audio
        this.kokoroReady = false;
        return false;
      }
    }
    if (!this.kokoroReady) {
      console.warn('Kokoro TTS not ready yet');
      return false;
    }
    this.enabled = !this.enabled;
    console.log("Kokoro TTS toggled:", this.enabled ? "ON" : "OFF");
    // Stop any ongoing speech when disabling
    if (!this.enabled && (this.speaking || this.isProcessingQueue)) {
      this.stop();
    }
    return this.enabled;
  }

  /**
   * Set the speech rate/speed. Values that are not numbers are ignored.
   * @param {number} speed - Speed multiplier, clamped to 0.5 - 2.0
   */
  setSpeed(speed) {
    const requested = Number(speed);
    if (Number.isNaN(requested)) {
      console.warn('Ignoring invalid Kokoro speed:', speed);
      return;
    }
    this.currentSpeed = Math.max(0.5, Math.min(2.0, requested));
  }

  /**
   * Strip markup that would sound strange when read aloud.
   * @param {string} text - Text to process
   * @returns {string} - Processed text ("" for empty input)
   */
  processTextForSpeech(text) {
    if (!text) return "";
    return String(text)
      .replace(/\*\*([^*]+)\*\*/g, '$1') // Bold
      .replace(/\*([^*]+)\*/g, '$1') // Italic
      .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1') // Links
      .replace(/<[^>]+>/g, ''); // HTML tags
  }

  /**
   * Split text into chunks of whole sentences for TTS.
   *
   * Sentences keep their terminators and are grouped greedily up to
   * `maxChunkLength` characters; a single longer sentence becomes its own
   * chunk. Text after the last terminator is kept, and whitespace-only chunks
   * are skipped.
   * @param {string} text - Text to split
   * @param {number} [maxChunkLength=500] - Soft upper bound per chunk
   * @returns {string[]} - Array of text chunks
   */
  splitTextIntoChunks(text, maxChunkLength = 500) {
    if (!text) return [];
    const source = String(text);
    // First alternative: a sentence ending in terminators.
    // Second alternative: the unterminated tail of the text.
    const sentences = source.match(/[^.!?]*[.!?]+|[^.!?]+$/g) || [source];
    const chunks = [];
    const pushChunk = (chunk) => {
      if (chunk.trim()) chunks.push(chunk);
    };
    let currentChunk = '';
    for (const sentence of sentences) {
      if (currentChunk.length + sentence.length > maxChunkLength) {
        pushChunk(currentChunk);
        currentChunk = sentence;
      } else {
        currentChunk += sentence;
      }
    }
    pushChunk(currentChunk);
    return chunks;
  }

  /**
   * Generate and play the next queued chunk, then schedule itself again
   * until the queue is empty. A run that was superseded by stop() (detected
   * via playbackToken) neither plays audio nor touches shared flags.
   * @private
   */
  async processQueue() {
    if (!this.audioContext) {
      console.warn('AudioContext not available, cannot process Kokoro queue.');
      this.isProcessingQueue = false;
      this.speaking = false;
      return;
    }
    // Make sure the context is running, unless the user paused on purpose
    if (this.audioContext.state === 'suspended' && !this.paused) {
      await this.audioContext.resume().catch(err => console.error('Error resuming AudioContext for queue:', err));
    }
    if (this.isProcessingQueue || this.audioQueue.length === 0 || !this.kokoroReady || !this.kokoroInstance) {
      if (this.audioQueue.length === 0) {
        this.speaking = false;
        this.isProcessingQueue = false;
      }
      return;
    }
    const token = this.playbackToken;
    this.isProcessingQueue = true;
    this.speaking = true;
    try {
      const textChunk = this.audioQueue.shift();
      if (textChunk) {
        console.log(`Kokoro generating chunk (${this.audioQueue.length} remaining):`, textChunk.substring(0, 30) + "...");
        const audioResult = await this.kokoroInstance.generate(textChunk, {
          voice: this.currentVoice,
          speed: this.currentSpeed,
        });
        if (token !== this.playbackToken) {
          // stop() was called while generating: discard the stale audio
          return;
        }
        if (!audioResult || !audioResult.audio || !(audioResult.audio instanceof Float32Array) || !audioResult.sampling_rate) {
          console.error('Invalid audio data or sampling rate received from KokoroTTS.generate', audioResult);
          throw new Error('Invalid audio data or sampling rate received from KokoroTTS.generate');
        }
        const rawAudioSamples = audioResult.audio;
        const samplingRate = audioResult.sampling_rate;
        console.log(`Received raw audio samples (${rawAudioSamples.length}), sample rate: ${samplingRate}`);
        await this.playRawAudio(rawAudioSamples, samplingRate);
      }
    } catch (error) {
      console.error("Error generating or playing Kokoro speech:", error);
    } finally {
      // Only the current run may update flags and keep the loop going;
      // a superseded run must not disturb a newer one.
      if (token === this.playbackToken) {
        this.isProcessingQueue = false;
        if (this.audioQueue.length === 0) {
          this.speaking = false;
          console.log("Kokoro queue finished.");
        }
        // setTimeout keeps the call stack flat between chunks
        setTimeout(() => this.processQueue(), 0);
      }
    }
  }

  /**
   * Play raw mono Float32Array samples using the Web Audio API. Resolves
   * when playback ends (or is stopped); errors are logged, not thrown.
   * @param {Float32Array} samples - The raw audio samples
   * @param {number} sampleRate - The sample rate of the audio
   * @private
   */
  async playRawAudio(samples, sampleRate) {
    if (!this.audioContext) {
      console.error('AudioContext not initialized.');
      return;
    }
    if (this.audioContext.state === 'suspended' && !this.paused) {
      await this.audioContext.resume().catch(err => console.error('Error resuming AudioContext for playback:', err));
    }
    let source = null;
    try {
      // Single channel: the samples are treated as mono
      const audioBuffer = this.audioContext.createBuffer(1, samples.length, sampleRate);
      audioBuffer.copyToChannel(samples, 0);
      source = this.audioContext.createBufferSource();
      source.buffer = audioBuffer;
      source.connect(this.audioContext.destination);
      // Remember the source so stop() can interrupt it
      this.currentAudioSource = source;
      console.log(`Playing audio buffer (${(samples.length / sampleRate).toFixed(2)}s)`);
      const playing = source;
      // Awaited inside the try so a failing start() is handled below
      await new Promise((resolve) => {
        playing.onended = () => {
          if (this.currentAudioSource === playing) {
            this.currentAudioSource = null;
          }
          console.log('Audio playback finished.');
          resolve();
        };
        playing.start(0);
      });
    } catch (error) {
      console.error('Error creating or playing raw audio buffer:', error);
      if (source === null || this.currentAudioSource === source) {
        this.currentAudioSource = null;
      }
    }
  }

  /**
   * Speak the provided text using KokoroTTS. If Kokoro is still
   * initializing, speaking starts once initialization succeeds.
   * @param {string} text - Text to be spoken
   * @param {function} onEndCallback - Callback when all speech ends (also
   *   called when nothing is spoken)
   */
  speak(text, onEndCallback = null) {
    if (!this.enabled || !text) {
      if (onEndCallback) onEndCallback();
      return;
    }
    if (!this.kokoroReady) {
      console.warn("Kokoro TTS not ready yet, waiting for initialization...");
      this.initializeKokoro().then(success => {
        if (success && this.enabled) {
          this._speakInternal(text, onEndCallback);
        } else {
          console.error("Kokoro failed to initialize, cannot speak.");
          if (onEndCallback) onEndCallback();
        }
      });
      return;
    }
    this._speakInternal(text, onEndCallback);
  }

  /**
   * Internal method to handle speech after initialization checks: replaces
   * whatever is playing with the new text.
   * @private
   */
  _speakInternal(text, onEndCallback) {
    if (this.audioContext && this.audioContext.state === 'suspended') {
      this.audioContext.resume().catch(err => console.error('Error resuming AudioContext:', err));
    }
    // Browsers block audio until the user has interacted with the page
    if (!this.hasUserActivation) {
      console.warn("Not attempting to speak because there hasn't been user interaction yet");
      if (onEndCallback) onEndCallback();
      return;
    }
    try {
      const processedText = this.processTextForSpeech(text);
      console.log("Kokoro TTS attempting to speak:", processedText.substring(0, 50) + "...");
      // Stop any existing speech (also invalidates in-flight generation)
      this.stop();
      this.audioQueue = this.splitTextIntoChunks(processedText);
      if (this.audioQueue.length > 0 && !this.isProcessingQueue) {
        this.processQueue();
      }
      if (onEndCallback) {
        // Poll until nothing is speaking any more
        const checkCompletion = () => {
          if (!this.isSpeaking()) {
            onEndCallback();
          } else {
            setTimeout(checkCompletion, 150);
          }
        };
        setTimeout(checkCompletion, 100);
      }
    } catch (error) {
      console.error("Error in Kokoro speak:", error);
      if (onEndCallback) onEndCallback();
    }
  }

  /**
   * Pause the current speech by suspending the AudioContext.
   */
  pause() {
    if (!this.speaking || !this.audioContext) return;
    this.audioContext.suspend().catch(err => console.error('Error suspending AudioContext:', err));
    this.paused = true;
    console.log('Kokoro audio paused (via AudioContext suspend)');
  }

  /**
   * Resume paused speech
   */
  resume() {
    if (!this.paused || !this.audioContext) return;
    this.audioContext.resume().catch(err => console.error('Error resuming AudioContext:', err));
    this.paused = false;
    console.log('Kokoro audio resumed (via AudioContext resume)');
  }

  /**
   * Stop the current speech, clear the queue and invalidate any chunk that
   * is still being generated.
   */
  stop() {
    this.playbackToken += 1;
    if (this.currentAudioSource) {
      try {
        this.currentAudioSource.stop();
      } catch (e) {
        // Ignore errors if source already stopped
      }
      this.currentAudioSource = null;
    }
    this.audioQueue = [];
    this.isProcessingQueue = false;
    this.speaking = false;
    this.paused = false;
    console.log('Kokoro speech stopped and queue cleared.');
  }

  /**
   * Check if TTS is currently active/enabled
   * @returns {boolean}
   */
  isEnabled() {
    return this.enabled && this.kokoroReady;
  }

  /**
   * Check if speech is currently in progress (queued, generating or playing)
   * @returns {boolean}
   */
  isSpeaking() {
    return this.speaking || this.isProcessingQueue || !!this.currentAudioSource;
  }
}
// Don't create a global instance here - the factory will do this
// const ttsHandler = new KokoroHandler();