597 lines
20 KiB
JavaScript
597 lines
20 KiB
JavaScript
/**
 * Kokoro Text-to-Speech Handler for AI Interactive Fiction
 * Uses the kokoro-js library for high-quality TTS
 */
|
|
|
|
/**
 * Text-to-speech handler built on a KokoroTTS instance (expected to be exposed
 * as `window.KokoroTTS`) and the Web Audio API.
 *
 * Lifecycle:
 *  1. The constructor starts loading the model in the background
 *     (`initializeKokoro`). Loading only succeeds when WebGPU is available.
 *  2. `toggle()` must be called from a user gesture: it creates the
 *     AudioContext and flips the enabled flag.
 *  3. `speak()` splits text into chunks, generates audio chunk by chunk and
 *     plays each chunk through an AudioBufferSourceNode.
 *
 * Every `stop()` (and therefore every new `speak()`) bumps an internal
 * generation counter. Work that was started for an older generation discards
 * its result instead of playing stale audio or touching the flags of the
 * newer request.
 */
class KokoroHandler {
  constructor() {
    this.enabled = false;
    this.speaking = false;
    this.paused = false;
    this.audio = null; // Unused by this class; kept so existing readers of the field still work
    this.currentSpeed = 1.0; // Forwarded to generate() as the `speed` option
    this.audioQueue = []; // Text chunks waiting to be synthesized
    this.isProcessingQueue = false;
    this.kokoroReady = false;
    this.kokoroInstance = null; // The KokoroTTS instance once loaded
    this.hasUserActivation = false;
    this.initializationPromise = null;
    this.audioContext = null; // Created lazily in toggle() (needs a user gesture)
    this.currentVoice = "af_heart";
    this.currentAudioSource = null; // The AudioBufferSourceNode currently playing, if any
    this._queueGeneration = 0; // Incremented by stop(); invalidates in-flight work

    // Start loading in the background; the promise never rejects.
    this.initializeKokoro();
  }

  /**
   * Start (or return the already started) initialization.
   * @returns {Promise<boolean>} Resolves to true when the model is ready,
   *   false on any failure. Never rejects.
   */
  initializeKokoro() {
    if (!this.initializationPromise) {
      this.initializationPromise = this._runInitialization();
    }
    return this.initializationPromise;
  }

  /**
   * Wait for the KokoroTTS class, then create the instance.
   * @private
   * @returns {Promise<boolean>}
   */
  async _runInitialization() {
    try {
      const classAvailable = await this._waitForKokoroClass();
      if (!classAvailable) {
        return false;
      }
      return await this._initKokoroInstance();
    } catch (error) {
      console.error('Error during KokoroHandler initialization:', error);
      return false;
    }
  }

  /**
   * Resolve once `window.KokoroTTS` is available.
   *
   * If the class is not there yet, waits for the `kokoro-class-loaded` /
   * `kokoro-class-load-failed` window events, giving up after `timeoutMs`.
   * @private
   * @param {number} [timeoutMs=15000] - How long to wait for the load event.
   * @returns {Promise<boolean>} true when the class can be used.
   */
  _waitForKokoroClass(timeoutMs = 15000) {
    if (typeof window === 'undefined') {
      console.warn('No window object available, Kokoro TTS cannot be initialized.');
      return Promise.resolve(false);
    }

    if (typeof window.KokoroTTS !== 'undefined') {
      console.log('KokoroTTS class found directly.');
      return Promise.resolve(true);
    }

    console.log('Kokoro TTS class not found, waiting for it to load...');

    return new Promise((resolve) => {
      let timeoutId = null;

      // Whichever of the three outcomes happens first removes the other two.
      const cleanup = () => {
        clearTimeout(timeoutId);
        window.removeEventListener('kokoro-class-loaded', onLoaded);
        window.removeEventListener('kokoro-class-load-failed', onFailed);
      };

      const onLoaded = () => {
        cleanup();
        console.log('KokoroTTS class loaded event received.');
        resolve(true);
      };

      const onFailed = () => {
        cleanup();
        console.error('KokoroTTS class failed to load.');
        resolve(false);
      };

      window.addEventListener('kokoro-class-loaded', onLoaded);
      window.addEventListener('kokoro-class-load-failed', onFailed);

      timeoutId = setTimeout(() => {
        cleanup();
        console.error('Timed out waiting for KokoroTTS class load event.');
        resolve(false);
      }, timeoutMs);
    });
  }

  /**
   * Create the KokoroTTS instance (WebGPU only).
   *
   * The model load is raced against a 55 second timeout; the timer is always
   * cleared afterwards so it does not linger once the load has settled.
   * @private
   * @returns {Promise<boolean>} true when the instance is ready.
   */
  async _initKokoroInstance() {
    if (this.kokoroInstance || this.kokoroReady) return true; // Already initialized

    const modelId = "onnx-community/Kokoro-82M-v1.0-ONNX";
    const dtype = 'fp32'; // Use fp32 for WebGPU as recommended
    const loadTimeoutMs = 55000;
    let timeoutId = null;

    try {
      console.log('Initializing KokoroTTS instance (GPU Only Attempt)...');

      const device = await this.getBestDevice();
      if (device !== 'webgpu') {
        console.warn('WebGPU not available or supported. Kokoro TTS (GPU) cannot be initialized.');
        this.kokoroReady = false;
        return false;
      }

      console.log(`Attempting KokoroTTS init with device: ${device}, dtype: ${dtype}`);
      console.log(`Calling KokoroTTS.from_pretrained('${modelId}', { dtype: '${dtype}', device: '${device}' })...`);

      const loadPromise = window.KokoroTTS.from_pretrained(modelId, { dtype, device });
      const timeoutPromise = new Promise((_, reject) => {
        timeoutId = setTimeout(
          () => reject(new Error('KokoroTTS.from_pretrained (WebGPU) timed out after 55 seconds')),
          loadTimeoutMs
        );
      });

      const instance = await Promise.race([loadPromise, timeoutPromise]);
      console.log('KokoroTTS.from_pretrained call completed.');

      if (!instance) {
        throw new Error('KokoroTTS.from_pretrained returned null or undefined.');
      }

      // AudioContext creation is deferred until toggle() (requires a user gesture).
      this.kokoroInstance = instance;
      this.kokoroReady = true;
      console.log('Kokoro TTS (WebGPU) instance created successfully (AudioContext deferred).');
      return true;
    } catch (error) {
      console.error('Error during KokoroTTS (WebGPU) initialization:', error);
      this.kokoroInstance = null;
      this.kokoroReady = false;
      return false;
    } finally {
      clearTimeout(timeoutId);
    }
  }

  /**
   * Determine the device to run on.
   * @private
   * @returns {Promise<string>} 'webgpu' when a WebGPU adapter can be obtained,
   *   otherwise 'wasm' (which the caller treats as "GPU unavailable").
   */
  async getBestDevice() {
    if (typeof navigator !== 'undefined' && navigator.gpu) {
      try {
        const adapter = await navigator.gpu.requestAdapter();
        if (adapter) {
          console.log('WebGPU supported, selecting webgpu device.');
          return 'webgpu';
        }
        console.warn('WebGPU adapter request returned null.');
      } catch (e) {
        console.warn('WebGPU adapter request failed:', e);
      }
    }
    console.log('WebGPU not supported or available, cannot use GPU for Kokoro.');
    return 'wasm';
  }

  /**
   * List available voices.
   *
   * Uses the array returned by `list_voices()` when there is one. If the call
   * yields nothing, falls back to the instance's `voices` property, and
   * finally to a single default entry.
   * NOTE(review): the shape of `voices` (object keyed by voice id) is an
   * assumption about kokoro-js - confirm against the library version in use.
   * @returns {Promise<Array<Object>>} Always an array (empty when not ready or on error).
   */
  async listVoices() {
    if (!this.kokoroReady || !this.kokoroInstance) {
      console.warn('Kokoro not ready, cannot list voices.');
      return [];
    }

    try {
      const instance = this.kokoroInstance;
      let rawVoices;

      if (typeof instance.list_voices === 'function') {
        rawVoices = await instance.list_voices();
      }
      if (Array.isArray(rawVoices)) {
        return rawVoices;
      }
      if (rawVoices === undefined || rawVoices === null) {
        rawVoices = instance.voices;
      }

      const voices = this._normalizeVoices(rawVoices);
      if (voices.length > 0) {
        return voices;
      }

      console.warn('No voice list available from KokoroTTS instance. Returning default.');
      return [{ name: 'af_heart', description: 'Default American Female' }];
    } catch (error) {
      console.error('Error listing Kokoro voices:', error);
      return [];
    }
  }

  /**
   * Convert a voice collection into an array of `{ name, description }`.
   * @private
   * @param {*} rawVoices - An array (returned as is) or an object keyed by voice id.
   * @returns {Array<Object>} Empty when `rawVoices` is neither.
   */
  _normalizeVoices(rawVoices) {
    if (Array.isArray(rawVoices)) {
      return rawVoices;
    }
    if (rawVoices === null || typeof rawVoices !== 'object') {
      return [];
    }
    return Object.keys(rawVoices).map((id) => {
      const details = rawVoices[id];
      const parts = details !== null && typeof details === 'object'
        ? [details.name, details.gender, details.language]
        : [];
      const description = parts
        .filter((part) => typeof part === 'string' && part !== '')
        .join(', ');
      return { name: id, description: description || id };
    });
  }

  /**
   * Set the voice to use.
   * @param {string} voiceName - Name of the voice (e.g., 'af_heart')
   */
  setVoice(voiceName) {
    this.currentVoice = voiceName;
    console.log(`Kokoro voice set to: ${voiceName}`);
  }

  /**
   * Toggle TTS on/off. Must be called from a user gesture: the first call
   * creates the AudioContext.
   * @returns {boolean} New enabled state; false when the AudioContext cannot
   *   be created or the model is not ready yet.
   */
  toggle() {
    this.hasUserActivation = true;

    if (!this._ensureAudioContext()) {
      // Without an AudioContext Kokoro cannot play audio at all.
      this.kokoroReady = false;
      return false;
    }

    if (!this.kokoroReady) {
      console.warn('Kokoro TTS not ready yet');
      return false;
    }

    this.enabled = !this.enabled;
    console.log("Kokoro TTS toggled:", this.enabled ? "ON" : "OFF");

    // Stop any ongoing speech when disabling
    if (!this.enabled && (this.speaking || this.isProcessingQueue)) {
      this.stop();
    }

    return this.enabled;
  }

  /**
   * Create the AudioContext if it does not exist yet.
   * @private
   * @returns {boolean} true when an AudioContext is available afterwards.
   */
  _ensureAudioContext() {
    if (this.audioContext) {
      return true;
    }
    try {
      this.audioContext = new (window.AudioContext || window.webkitAudioContext)();
      console.log('AudioContext created on user activation.');
      if (this.audioContext.state === 'suspended') {
        this.audioContext.resume().catch(err => console.error('Error resuming initial AudioContext:', err));
      }
      return true;
    } catch (e) {
      console.error('Failed to create AudioContext:', e);
      return false;
    }
  }

  /**
   * Set the speech rate. The value is passed to `generate()` as `speed`.
   * @param {number} speed - Speed multiplier, clamped to the range 0.5 to 2.0.
   *   Values that are not finite numbers are ignored.
   */
  setSpeed(speed) {
    const value = Number(speed);
    if (!Number.isFinite(value)) {
      console.warn('Ignoring invalid Kokoro speed:', speed);
      return;
    }
    this.currentSpeed = Math.max(0.5, Math.min(2.0, value));
  }

  /**
   * Strip formatting that would sound strange when read aloud.
   * @param {string} text - Text to process
   * @returns {string} Text without bold/italic markers, with markdown links
   *   reduced to their label and HTML tags removed.
   */
  processTextForSpeech(text) {
    if (!text) return "";

    return text
      .replace(/\*\*([^*]+)\*\*/g, '$1') // Bold
      .replace(/\*([^*]+)\*/g, '$1') // Italic
      .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1') // Links
      .replace(/<[^>]+>/g, ''); // HTML tags
  }

  /**
   * Split text into chunks of whole sentences.
   *
   * Sentences end at a run of `.`, `!` or `?`; text after the last terminator
   * is kept as a final sentence instead of being dropped. A single sentence
   * longer than the limit is not split further.
   * @param {string} text - Text to split
   * @param {number} [maxChunkLength=500] - Soft upper bound for a chunk, in characters.
   * @returns {string[]} Array of text chunks
   */
  splitTextIntoChunks(text, maxChunkLength = 500) {
    if (!text) return [];

    // `|$` lets the final, unterminated sentence match as well.
    const sentences = text.match(/[^.!?]+(?:[.!?]+|$)/g) || [text];

    const chunks = [];
    let currentChunk = '';

    for (const sentence of sentences) {
      // Trailing whitespace after the last terminator is not worth synthesizing.
      if (sentence.trim() === '') continue;

      if (currentChunk.length + sentence.length > maxChunkLength) {
        if (currentChunk) chunks.push(currentChunk);
        currentChunk = sentence;
      } else {
        currentChunk += sentence;
      }
    }

    if (currentChunk) chunks.push(currentChunk);

    return chunks;
  }

  /**
   * Synthesize and play the next chunk of the queue, then schedule itself for
   * the following chunk.
   *
   * The run remembers the generation it started in. If `stop()` is called
   * while audio is being generated or played, the run discards its result and
   * leaves all flags alone, because they now belong to a newer request.
   * @private
   * @returns {Promise<void>}
   */
  async processQueue() {
    if (!this.audioContext) {
      console.warn('AudioContext not available, cannot process Kokoro queue.');
      this.isProcessingQueue = false;
      this.speaking = false;
      return;
    }

    // Do not undo an explicit pause() by resuming the context here.
    if (this.audioContext.state === 'suspended' && !this.paused) {
      await this.audioContext.resume().catch(err => console.error('Error resuming AudioContext for queue:', err));
    }

    if (this.isProcessingQueue) {
      return; // Another run owns the queue
    }
    if (this.audioQueue.length === 0) {
      this.speaking = false;
      return;
    }
    if (!this.kokoroReady || !this.kokoroInstance) {
      return;
    }

    const generation = this._queueGeneration;
    this.isProcessingQueue = true;
    this.speaking = true;

    try {
      const textChunk = this.audioQueue.shift();

      if (textChunk) {
        console.log(
          `Kokoro generating chunk (${this.audioQueue.length} remaining):`,
          `${textChunk.substring(0, 30)}...`
        );

        const audioResult = await this.kokoroInstance.generate(textChunk, {
          voice: this.currentVoice,
          speed: this.currentSpeed,
        });

        if (generation !== this._queueGeneration) {
          console.log('Discarding Kokoro audio generated for a stopped request.');
        } else {
          if (!audioResult || !audioResult.audio || !(audioResult.audio instanceof Float32Array) || !audioResult.sampling_rate) {
            console.error('Invalid audio data or sampling rate received from KokoroTTS.generate', audioResult);
            throw new Error('Invalid audio data or sampling rate received from KokoroTTS.generate');
          }

          const rawAudioSamples = audioResult.audio;
          const samplingRate = audioResult.sampling_rate;
          console.log(`Received raw audio samples (${rawAudioSamples.length}), sample rate: ${samplingRate}`);

          await this.playRawAudio(rawAudioSamples, samplingRate);
        }
      }
    } catch (error) {
      // A failed chunk is skipped; the rest of the queue is still processed.
      console.error("Error generating or playing Kokoro speech:", error);
    }

    if (generation !== this._queueGeneration) {
      return; // Superseded by stop(): flags and queue belong to the newer request
    }

    this.isProcessingQueue = false;
    if (this.audioQueue.length === 0) {
      this.speaking = false;
      console.log("Kokoro queue finished.");
    } else {
      // setTimeout keeps the call stack flat between chunks.
      setTimeout(() => this.processQueue(), 0);
    }
  }

  /**
   * Play raw mono samples through the Web Audio API.
   * @private
   * @param {Float32Array} samples - The raw audio samples
   * @param {number} sampleRate - The sample rate of the audio
   * @returns {Promise<void>} Resolves when playback ended (or was stopped, or
   *   failed). Never rejects.
   */
  async playRawAudio(samples, sampleRate) {
    if (!this.audioContext) {
      console.error('AudioContext not initialized.');
      return;
    }

    const generation = this._queueGeneration;

    // While paused the context stays suspended; the source is still started
    // below and becomes audible once resume() is called.
    if (this.audioContext.state === 'suspended' && !this.paused) {
      await this.audioContext.resume().catch(err => console.error('Error resuming AudioContext for playback:', err));
      if (generation !== this._queueGeneration) {
        return; // stop() was called while the context was resuming
      }
    }

    let source = null;
    try {
      // One channel: the samples are assumed to be mono.
      const audioBuffer = this.audioContext.createBuffer(1, samples.length, sampleRate);
      audioBuffer.copyToChannel(samples, 0);

      source = this.audioContext.createBufferSource();
      source.buffer = audioBuffer;
      source.connect(this.audioContext.destination);

      // Remember the source so stop() can interrupt it.
      this.currentAudioSource = source;

      console.log(`Playing audio buffer (${(samples.length / sampleRate).toFixed(2)}s)`);

      // Awaited inside the try so that a throwing start() reaches the catch below.
      await new Promise((resolve) => {
        source.onended = () => {
          if (this.currentAudioSource === source) {
            this.currentAudioSource = null;
          }
          console.log('Audio playback finished.');
          resolve();
        };
        source.start(0);
      });
    } catch (error) {
      console.error('Error creating or playing raw audio buffer:', error);
      if (source !== null && this.currentAudioSource === source) {
        this.currentAudioSource = null;
      }
    }
  }

  /**
   * Speak the provided text. Any speech in progress is replaced.
   * @param {string} text - Text to be spoken
   * @param {function} onEndCallback - Called once when speech has ended, was
   *   stopped or replaced, or could not be started.
   */
  speak(text, onEndCallback = null) {
    if (!this.enabled || !text) {
      if (onEndCallback) onEndCallback();
      return;
    }

    if (!this.kokoroReady) {
      console.warn("Kokoro TTS not ready yet, waiting for initialization...");
      this.initializeKokoro().then(success => {
        if (success && this.enabled) {
          this._speakInternal(text, onEndCallback);
        } else {
          console.error("Kokoro failed to initialize, cannot speak.");
          if (onEndCallback) onEndCallback();
        }
      });
      return;
    }

    this._speakInternal(text, onEndCallback);
  }

  /**
   * Queue the text and start processing; assumes readiness was checked.
   * @private
   * @param {string} text - Text to be spoken
   * @param {function|null} onEndCallback - See `speak`.
   */
  _speakInternal(text, onEndCallback) {
    if (this.audioContext && this.audioContext.state === 'suspended') {
      this.audioContext.resume().catch(err => console.error('Error resuming AudioContext:', err));
    }

    // Browsers block audio until the user has interacted with the page.
    if (!this.hasUserActivation) {
      console.warn("Not attempting to speak because there hasn't been user interaction yet");
      if (onEndCallback) onEndCallback();
      return;
    }

    try {
      const processedText = this.processTextForSpeech(text);
      console.log("Kokoro TTS attempting to speak:", `${processedText.substring(0, 50)}...`);

      // Replaces whatever was playing and starts a new generation.
      this.stop();
      const generation = this._queueGeneration;

      this.audioQueue = this.splitTextIntoChunks(processedText);

      if (this.audioQueue.length > 0 && !this.isProcessingQueue) {
        this.processQueue();
      }

      if (onEndCallback) {
        // Poll until this request is over. A changed generation means the
        // request was stopped or replaced, so it is over regardless of
        // whether something newer is currently speaking.
        const checkCompletion = () => {
          if (generation !== this._queueGeneration || !this.isSpeaking()) {
            onEndCallback();
          } else {
            setTimeout(checkCompletion, 150);
          }
        };
        setTimeout(checkCompletion, 100);
      }
    } catch (error) {
      console.error("Error in Kokoro speak:", error);
      if (onEndCallback) onEndCallback();
    }
  }

  /**
   * Pause the current speech by suspending the AudioContext.
   */
  pause() {
    if (!this.speaking || !this.audioContext) return;
    this.audioContext.suspend().catch(err => console.error('Error suspending AudioContext:', err));
    this.paused = true;
    console.log('Kokoro audio paused (via AudioContext suspend)');
  }

  /**
   * Resume paused speech.
   */
  resume() {
    if (!this.paused || !this.audioContext) return;
    this.audioContext.resume().catch(err => console.error('Error resuming AudioContext:', err));
    this.paused = false;
    console.log('Kokoro audio resumed (via AudioContext resume)');
  }

  /**
   * Stop the current speech, clear the queue and invalidate any audio that is
   * still being generated.
   */
  stop() {
    // In-flight generate()/playback work compares against this counter.
    this._queueGeneration += 1;

    if (this.currentAudioSource) {
      try {
        this.currentAudioSource.stop();
      } catch (e) {
        // Ignore errors if source already stopped
      }
      this.currentAudioSource = null;
    }

    // A paused context would otherwise stay suspended although `paused` is
    // cleared below.
    if (this.paused && this.audioContext && this.audioContext.state === 'suspended') {
      this.audioContext.resume().catch(err => console.error('Error resuming AudioContext:', err));
    }

    this.audioQueue = [];
    this.isProcessingQueue = false;
    this.speaking = false;
    this.paused = false;
    console.log('Kokoro speech stopped and queue cleared.');
  }

  /**
   * Check if TTS is currently active/enabled.
   * @returns {boolean}
   */
  isEnabled() {
    return this.enabled && this.kokoroReady;
  }

  /**
   * Check if speech is currently in progress (queue being processed or an
   * audio source still active).
   * @returns {boolean}
   */
  isSpeaking() {
    return this.speaking || this.isProcessingQueue || !!this.currentAudioSource;
  }
}
|
|
|
|
// Don't create a global instance here - the factory will do this
|
|
// const ttsHandler = new KokoroHandler();
|