feat: Integrate Kokoro TTS with WebGPU and fallback
This commit is contained in:
@@ -0,0 +1,597 @@
|
||||
/**
|
||||
* Kokoro Text-to-Speech Handler for AI Interactive Fiction
|
||||
* Uses the kokoro-js library for high-quality TTS
|
||||
*/
|
||||
|
||||
class KokoroHandler {
  /**
   * Set up default state and kick off model initialization in the background.
   * The resulting promise is stored in `this.initializationPromise`.
   */
  constructor() {
    this.enabled = false; // Flipped by toggle()
    this.speaking = false; // True while a speak request is being worked on
    this.paused = false; // True between pause() and resume()/stop()
    this.audio = null; // Not used by this class; kept so existing readers of the field do not break
    this.currentSpeed = 1.0; // Forwarded to generate() as the `speed` option
    this.audioQueue = []; // Pending text chunks, consumed front-to-back
    this.isProcessingQueue = false; // Guards against two concurrent queue runs
    this.kokoroReady = false;
    this.kokoroInstance = null; // The KokoroTTS instance once loaded
    this.hasUserActivation = false; // Set by toggle(); required before any playback
    this.initializationPromise = null;
    this.audioContext = null; // Created lazily in toggle()
    this.currentVoice = "af_heart";
    this.currentAudioSource = null; // Currently playing AudioBufferSourceNode, if any

    // Bumped by stop(). Async work started before a stop() compares its
    // captured value against this one and abandons itself when they differ.
    this._playbackGeneration = 0;

    // Start initialization process
    this.initializeKokoro();
  }

  /**
   * Initialize Kokoro TTS, waiting for the KokoroTTS class to load if needed.
   * Repeated calls share one initialization attempt.
   * @returns {Promise<boolean>} Resolves to true when the instance is ready,
   *   false on any failure. Never rejects.
   */
  initializeKokoro() {
    if (!this.initializationPromise) {
      this.initializationPromise = this._runInitialization();
    }
    return this.initializationPromise;
  }

  /**
   * Perform the single initialization attempt behind initializeKokoro().
   * @returns {Promise<boolean>} true on success, false otherwise.
   * @private
   */
  async _runInitialization() {
    try {
      if (typeof window === 'undefined') {
        console.warn('No window object available, Kokoro TTS cannot be initialized.');
        return false;
      }

      if (typeof window.KokoroTTS === 'undefined') {
        console.log('Kokoro TTS class not found, waiting for it to load...');
        const classAvailable = await this._waitForKokoroClass(15000);
        if (!classAvailable) {
          return false;
        }
      } else {
        console.log('KokoroTTS class found directly.');
      }

      return await this._initKokoroInstance();
    } catch (error) {
      console.error('Error during KokoroHandler initialization:', error);
      return false;
    }
  }

  /**
   * Wait for the 'kokoro-class-loaded' / 'kokoro-class-load-failed' window
   * events, giving up after `timeoutMs`. Listeners and the timer are always
   * removed once the wait settles.
   * @param {number} timeoutMs - Maximum time to wait, in milliseconds.
   * @returns {Promise<boolean>} true when the class should now be usable.
   * @private
   */
  _waitForKokoroClass(timeoutMs) {
    return new Promise((resolve) => {
      let timeoutId = null;

      const settle = (available) => {
        clearTimeout(timeoutId);
        window.removeEventListener('kokoro-class-loaded', onLoaded);
        window.removeEventListener('kokoro-class-load-failed', onFailed);
        resolve(available);
      };

      const onLoaded = () => {
        console.log('KokoroTTS class loaded event received.');
        settle(true);
      };

      const onFailed = () => {
        console.error('KokoroTTS class failed to load.');
        settle(false);
      };

      window.addEventListener('kokoro-class-loaded', onLoaded);
      window.addEventListener('kokoro-class-load-failed', onFailed);

      timeoutId = setTimeout(() => {
        // The class may have been assigned without the event being dispatched;
        // in that case proceed instead of failing.
        const available = typeof window.KokoroTTS !== 'undefined';
        if (!available) {
          console.error('Timed out waiting for KokoroTTS class load event.');
        }
        settle(available);
      }, timeoutMs);
    });
  }

  /**
   * Internal method to create and initialize the KokoroTTS instance.
   * Only WebGPU is attempted; when getBestDevice() reports anything else the
   * method returns false without loading the model.
   * @returns {Promise<boolean>} true when the instance is ready.
   * @private
   */
  async _initKokoroInstance() {
    if (this.kokoroInstance || this.kokoroReady) return true; // Already initialized

    let timeoutId = null;
    try {
      console.log('Initializing KokoroTTS instance (GPU Only Attempt)...');
      const model_id = "onnx-community/Kokoro-82M-v1.0-ONNX";

      const device = await this.getBestDevice();
      if (device !== 'webgpu') {
        console.warn('WebGPU not available or supported. Kokoro TTS (GPU) cannot be initialized.');
        this.kokoroReady = false;
        return false;
      }

      // fp32 is the dtype this integration pairs with WebGPU.
      const dtype = 'fp32';
      console.log(`Attempting KokoroTTS init with device: ${device}, dtype: ${dtype}`);
      console.log(`Calling KokoroTTS.from_pretrained('${model_id}', { dtype: '${dtype}', device: '${device}' })...`);

      const fromPretrainedPromise = window.KokoroTTS.from_pretrained(model_id, {
        dtype: dtype,
        device: device,
      });

      // Race the model load against a 55 second timer so a stalled download
      // cannot leave initialization pending forever.
      const pretrainedTimeoutPromise = new Promise((_, reject) => {
        timeoutId = setTimeout(
          () => reject(new Error('KokoroTTS.from_pretrained (WebGPU) timed out after 55 seconds')),
          55000
        );
      });

      this.kokoroInstance = await Promise.race([
        fromPretrainedPromise,
        pretrainedTimeoutPromise
      ]);

      console.log('KokoroTTS.from_pretrained call completed.');

      if (!this.kokoroInstance) {
        throw new Error('KokoroTTS.from_pretrained returned null or undefined.');
      }

      // AudioContext creation is deferred to toggle(), i.e. to a user gesture.
      this.kokoroReady = true;
      console.log('Kokoro TTS (WebGPU) instance created successfully (AudioContext deferred).');
      return true;
    } catch (error) {
      console.error('Error during KokoroTTS (WebGPU) initialization:', error);
      this.kokoroInstance = null;
      this.kokoroReady = false;
      return false;
    } finally {
      // Release the timer whether the load succeeded, failed, or timed out.
      clearTimeout(timeoutId);
    }
  }

  /**
   * Determine the best device by probing for a WebGPU adapter.
   * @returns {Promise<string>} 'webgpu' when an adapter was obtained,
   *   otherwise 'wasm' (which callers in this class treat as "no GPU").
   * @private
   */
  async getBestDevice() {
    if (typeof navigator !== 'undefined' && navigator.gpu) {
      try {
        const adapter = await navigator.gpu.requestAdapter();
        if (adapter) {
          console.log('WebGPU supported, selecting webgpu device.');
          return 'webgpu';
        }
        console.warn('WebGPU adapter request returned null.');
      } catch (e) {
        console.warn('WebGPU adapter request failed:', e);
      }
    }
    console.log('WebGPU not supported or available, cannot use GPU for Kokoro.');
    return 'wasm';
  }

  /**
   * List available voices (delegates to the KokoroTTS instance).
   * @returns {Promise<Array|*>} Whatever `list_voices()` returns when it
   *   returns a value; otherwise an array built from the instance's `voices`
   *   map, or a single-entry default. An empty array when not ready or on error.
   */
  async listVoices() {
    if (!this.kokoroReady || !this.kokoroInstance) {
      console.warn('Kokoro not ready, cannot list voices.');
      return [];
    }

    const defaultVoices = [{ name: 'af_heart', description: 'Default American Female' }];

    try {
      const instance = this.kokoroInstance;

      if (typeof instance.list_voices !== 'function') {
        console.warn('list_voices method not found on KokoroTTS instance. Returning default.');
        return defaultVoices;
      }

      const listed = await instance.list_voices();
      if (listed !== undefined && listed !== null) {
        return listed;
      }

      // NOTE(review): list_voices() may only print a table and return nothing.
      // Assumes `voices` (when present) maps voice id -> metadata object;
      // confirm against the kokoro-js version in use.
      const voiceMap = instance.voices;
      if (voiceMap && typeof voiceMap === 'object') {
        const voiceIds = Object.keys(voiceMap);
        if (voiceIds.length > 0) {
          // `name` carries the id so the entry can be passed to setVoice().
          return voiceIds.map((id) => Object.assign({}, voiceMap[id], { name: id }));
        }
      }

      return defaultVoices;
    } catch (error) {
      console.error('Error listing Kokoro voices:', error);
      return [];
    }
  }

  /**
   * Set the voice to use for subsequently generated chunks.
   * @param {string} voiceName - Name of the voice (e.g., 'af_heart')
   */
  setVoice(voiceName) {
    this.currentVoice = voiceName;
    console.log(`Kokoro voice set to: ${voiceName}`);
  }

  /**
   * Toggle TTS functionality on/off. Must be called from a user gesture: it
   * records user activation and creates the AudioContext on first use.
   * @returns {boolean} New state of TTS (enabled/disabled); false when the
   *   AudioContext cannot be created or Kokoro is not ready.
   */
  toggle() {
    this.hasUserActivation = true;

    if (!this.audioContext) {
      try {
        this.audioContext = new (window.AudioContext || window.webkitAudioContext)();
        console.log('AudioContext created on user activation.');
        if (this.audioContext.state === 'suspended') {
          this.audioContext.resume().catch(err => console.error('Error resuming initial AudioContext:', err));
        }
      } catch (e) {
        console.error('Failed to create AudioContext:', e);
        // Without an AudioContext nothing can be played back.
        this.kokoroReady = false;
        return false;
      }
    }

    if (!this.kokoroReady) {
      console.warn('Kokoro TTS not ready yet');
      return false;
    }

    this.enabled = !this.enabled;
    console.log("Kokoro TTS toggled:", this.enabled ? "ON" : "OFF");

    // Stop any ongoing speech when disabling
    if (!this.enabled && (this.speaking || this.isProcessingQueue)) {
      this.stop();
    }

    return this.enabled;
  }

  /**
   * Set the speech rate/speed used for subsequently generated chunks.
   * Values that are not numbers are ignored and the previous speed is kept.
   * @param {number} speed - Speed multiplier, clamped to the range 0.5 to 2.0
   */
  setSpeed(speed) {
    const requested = Number(speed);
    if (Number.isNaN(requested)) {
      console.warn('Ignoring invalid Kokoro speed value:', speed);
      return;
    }
    this.currentSpeed = Math.max(0.5, Math.min(2.0, requested));
  }

  /**
   * Strip markup that would sound strange when read aloud: markdown bold,
   * italic and links (keeping their text), then anything shaped like an HTML tag.
   * @param {string} text - Text to process
   * @returns {string} - Processed text ("" for empty input)
   */
  processTextForSpeech(text) {
    if (!text) return "";

    // Bold must be handled before italic so `**x**` is not half-consumed.
    text = text.replace(/\*\*([^*]+)\*\*/g, '$1'); // Bold
    text = text.replace(/\*([^*]+)\*/g, '$1'); // Italic
    text = text.replace(/\[([^\]]+)\]\([^)]+\)/g, '$1'); // Links

    text = text.replace(/<[^>]+>/g, ''); // HTML tags

    return text;
  }

  /**
   * Split text into chunks of whole sentences for TTS generation.
   * A sentence ends at a run of `.`, `!` or `?`; trailing text without a
   * terminator is kept as a final sentence. A single sentence longer than the
   * limit is not split further.
   * @param {string} text - Text to split
   * @param {number} [maxChunkLength=500] - Soft upper bound on chunk length
   * @returns {string[]} - Array of text chunks
   */
  splitTextIntoChunks(text, maxChunkLength = 500) {
    if (!text) return [];

    // `|$` lets the last alternative match an unterminated tail instead of
    // dropping it.
    const sentenceRegex = /[^.!?]+(?:[.!?]+|$)/g;
    const sentences = text.match(sentenceRegex) || [text];

    const chunks = [];
    let currentChunk = '';

    for (const sentence of sentences) {
      // Only an unterminated tail can be pure whitespace; nothing to speak.
      if (!sentence.trim()) continue;

      if (currentChunk.length + sentence.length > maxChunkLength) {
        if (currentChunk) chunks.push(currentChunk);
        currentChunk = sentence;
      } else {
        currentChunk += sentence;
      }
    }

    if (currentChunk) chunks.push(currentChunk);

    return chunks;
  }

  /**
   * Take one chunk off the queue, generate its audio, play it, then schedule
   * the next run. Does nothing when a run is already active. A run that was
   * overtaken by stop() abandons itself without touching shared state, so it
   * can neither play a stale chunk nor disturb the run that replaced it.
   * @private
   */
  async processQueue() {
    if (!this.audioContext) {
      console.warn('AudioContext not available, cannot process Kokoro queue.');
      this.isProcessingQueue = false;
      this.speaking = false;
      return;
    }

    if (this.isProcessingQueue) return;

    if (this.audioQueue.length === 0) {
      this.speaking = false;
      return;
    }

    if (!this.kokoroReady || !this.kokoroInstance) return;

    // Flags are raised before the first await so isSpeaking() is accurate
    // as soon as this call returns to its caller.
    this.isProcessingQueue = true;
    this.speaking = true;

    const runGeneration = this._playbackGeneration;
    const isStale = () => runGeneration !== this._playbackGeneration;

    try {
      // Do not undo an explicit pause(); only wake a context that is
      // suspended for other reasons.
      if (this.audioContext.state === 'suspended' && !this.paused) {
        await this.audioContext.resume().catch(err => console.error('Error resuming AudioContext for queue:', err));
        if (isStale()) return;
      }

      const textChunk = this.audioQueue.shift();

      if (textChunk) {
        console.log(`Kokoro generating chunk (${this.audioQueue.length} remaining):`, textChunk.substring(0, 30) + "...");

        const audioResult = await this.kokoroInstance.generate(textChunk, {
          voice: this.currentVoice,
          speed: this.currentSpeed,
        });

        // stop() ran while generating: discard the result.
        if (isStale()) return;

        if (!audioResult || !audioResult.audio || !(audioResult.audio instanceof Float32Array) || !audioResult.sampling_rate) {
          console.error('Invalid audio data or sampling rate received from KokoroTTS.generate', audioResult);
          throw new Error('Invalid audio data or sampling rate received from KokoroTTS.generate');
        }

        const rawAudioSamples = audioResult.audio;
        const samplingRate = audioResult.sampling_rate;
        console.log(`Received raw audio samples (${rawAudioSamples.length}), sample rate: ${samplingRate}`);

        await this.playRawAudio(rawAudioSamples, samplingRate);
      }
    } catch (error) {
      // A failed chunk is skipped; the remaining chunks are still processed.
      console.error("Error generating or playing Kokoro speech:", error);
    }

    // stop() already reset the flags; they may now belong to a newer run.
    if (isStale()) return;

    this.isProcessingQueue = false;
    if (this.audioQueue.length === 0) {
      this.speaking = false;
      console.log("Kokoro queue finished.");
    }

    // Deferred so consecutive chunks do not grow the call stack.
    setTimeout(() => this.processQueue(), 0);
  }

  /**
   * Play raw Float32Array audio samples using the Web Audio API.
   * Errors while building or starting the buffer are logged, not thrown.
   * @param {Float32Array} samples - The raw audio samples (treated as mono)
   * @param {number} sampleRate - The sample rate of the audio
   * @returns {Promise<void>} Resolves when playback ends or is stopped.
   * @private
   */
  async playRawAudio(samples, sampleRate) {
    if (!this.audioContext) {
      console.error('AudioContext not initialized.');
      return;
    }

    const generationAtStart = this._playbackGeneration;

    // Leave the context suspended when the user paused; playback scheduled
    // below then begins once resume() is called.
    if (this.audioContext.state === 'suspended' && !this.paused) {
      await this.audioContext.resume().catch(err => console.error('Error resuming AudioContext for playback:', err));
      if (generationAtStart !== this._playbackGeneration) return; // stopped meanwhile
    }

    try {
      const audioBuffer = this.audioContext.createBuffer(
        1, // Single channel; stereo input would need a different copy step
        samples.length,
        sampleRate
      );
      audioBuffer.copyToChannel(samples, 0);

      const source = this.audioContext.createBufferSource();
      source.buffer = audioBuffer;
      source.connect(this.audioContext.destination);

      // Remembered so stop() can cut playback short.
      this.currentAudioSource = source;

      console.log(`Playing audio buffer (${(samples.length / sampleRate).toFixed(2)}s)`);

      return new Promise((resolve) => {
        source.onended = () => {
          // Only clear the reference if it still points at this source.
          if (this.currentAudioSource === source) {
            this.currentAudioSource = null;
          }
          console.log('Audio playback finished.');
          resolve();
        };
        source.start(0);
      });
    } catch (error) {
      console.error('Error creating or playing raw audio buffer:', error);
      this.currentAudioSource = null;
    }
  }

  /**
   * Speak the provided text using KokoroTTS, replacing any speech in progress.
   * When TTS is disabled or the text is empty, the callback is invoked
   * immediately and nothing is spoken.
   * @param {string} text - Text to be spoken
   * @param {function} onEndCallback - Callback when all speech ends
   */
  speak(text, onEndCallback = null) {
    if (!this.enabled || !text) {
      if (onEndCallback) onEndCallback();
      return;
    }

    if (!this.kokoroReady) {
      console.warn("Kokoro TTS not ready yet, waiting for initialization...");
      this.initializeKokoro().then(success => {
        if (success && this.enabled) {
          this._speakInternal(text, onEndCallback);
        } else {
          console.error("Kokoro failed to initialize, cannot speak.");
          if (onEndCallback) onEndCallback();
        }
      });
      return;
    }

    this._speakInternal(text, onEndCallback);
  }

  /**
   * Internal method to handle speech after initialization checks: cleans the
   * text, stops current speech, queues the chunks and starts the queue.
   * The callback fires once this request has finished or has been
   * superseded by a later stop()/speak().
   * @param {string} text - Text to be spoken
   * @param {function} onEndCallback - Callback when this request ends
   * @private
   */
  _speakInternal(text, onEndCallback) {
    if (this.audioContext && this.audioContext.state === 'suspended') {
      this.audioContext.resume().catch(err => console.error('Error resuming AudioContext:', err));
    }

    // Playback is only attempted after a user gesture (see toggle()).
    if (!this.hasUserActivation) {
      console.warn("Not attempting to speak because there hasn't been user interaction yet");
      if (onEndCallback) onEndCallback();
      return;
    }

    try {
      const processedText = this.processTextForSpeech(text);
      console.log("Kokoro TTS attempting to speak:", processedText.substring(0, 50) + "...");

      // Stop any existing speech; this also invalidates in-flight work.
      this.stop();
      const requestGeneration = this._playbackGeneration;

      this.audioQueue = this.splitTextIntoChunks(processedText);

      if (this.audioQueue.length > 0 && !this.isProcessingQueue) {
        this.processQueue();
      }

      if (onEndCallback) {
        const checkCompletion = () => {
          const superseded = requestGeneration !== this._playbackGeneration;
          if (superseded || !this.isSpeaking()) {
            onEndCallback();
          } else {
            setTimeout(checkCompletion, 150); // Check again shortly
          }
        };
        // Start checking slightly after processing begins
        setTimeout(checkCompletion, 100);
      }
    } catch (error) {
      console.error("Error in Kokoro speak:", error);
      if (onEndCallback) onEndCallback();
    }
  }

  /**
   * Pause the current speech by suspending the AudioContext.
   * No-op when nothing is being spoken or no AudioContext exists.
   */
  pause() {
    if (!this.speaking || !this.audioContext) return;
    this.audioContext.suspend().catch(err => console.error('Error suspending AudioContext:', err));
    this.paused = true;
    console.log('Kokoro audio paused (via AudioContext suspend)');
  }

  /**
   * Resume paused speech by resuming the AudioContext.
   * No-op unless pause() was called before.
   */
  resume() {
    if (!this.paused || !this.audioContext) return;
    this.audioContext.resume().catch(err => console.error('Error resuming AudioContext:', err));
    this.paused = false;
    console.log('Kokoro audio resumed (via AudioContext resume)');
  }

  /**
   * Stop the current speech, clear the queue and reset all state flags.
   * Also invalidates any generation or playback still in flight.
   */
  stop() {
    // Any queue run or playback started before this point is now stale.
    this._playbackGeneration += 1;

    if (this.currentAudioSource) {
      try {
        this.currentAudioSource.stop();
      } catch (e) {
        // stop() throws if the source never started or already stopped;
        // either way there is nothing left to halt.
      }
      this.currentAudioSource = null;
    }

    this.audioQueue = [];
    this.isProcessingQueue = false;
    this.speaking = false;
    this.paused = false;
    console.log('Kokoro speech stopped and queue cleared.');
  }

  /**
   * Check if TTS is currently active/enabled.
   * @returns {boolean} true when toggled on and the Kokoro instance is ready.
   */
  isEnabled() {
    return this.enabled && this.kokoroReady;
  }

  /**
   * Check if speech is currently in progress.
   * @returns {boolean} true while a queue run is active or audio is playing.
   */
  isSpeaking() {
    return this.speaking || this.isProcessingQueue || !!this.currentAudioSource;
  }
}
|
||||
|
||||
// Don't create a global instance here - the factory will do this
|
||||
// const ttsHandler = new KokoroHandler();
|
||||
Reference in New Issue
Block a user