feat: Integrate Kokoro TTS with WebGPU and fallback

2025-04-01 10:34:24 +00:00
parent 113e3b995d
commit 1882acac8c
111 changed files with 9143 additions and 4447 deletions
@@ -0,0 +1,597 @@
/**
 * Kokoro Text-to-Speech Handler for AI Interactive Fiction
 * Uses the kokoro-js library for high-quality TTS
 */

/**
 * Wraps a KokoroTTS instance (expected on `window.KokoroTTS`) and plays the
 * generated samples through the Web Audio API.
 *
 * Lifecycle as implemented below:
 *  - The constructor starts model initialization immediately (WebGPU only).
 *  - `toggle()` must be called from a user gesture: it creates the
 *    AudioContext and flips `enabled`.
 *  - `speak()` replaces any current speech, splits the text into chunks and
 *    drains them one by one through `processQueue()`.
 *  - `stop()` invalidates the running queue loop via `playbackSession`, so
 *    audio that is still being generated is discarded instead of played late.
 */
class KokoroHandler {
  constructor() {
    this.enabled = false;
    this.speaking = false;
    this.paused = false;
    this.audio = null; // Not used by this class; kept so outside code reading it keeps working
    this.currentSpeed = 1.0; // Forwarded to KokoroTTS.generate as the `speed` option
    this.audioQueue = [];
    this.isProcessingQueue = false;
    this.kokoroReady = false;
    this.kokoroInstance = null; // Store the KokoroTTS instance
    this.hasUserActivation = false;
    this.initializationPromise = null;
    this.audioContext = null; // For playing the generated audio
    this.currentVoice = "af_heart"; // Default voice from README
    this.currentAudioSource = null; // To keep track of the playing audio source
    // Incremented by stop(); a queue loop only touches shared state while its
    // captured value still matches, which prevents stale audio and double loops.
    this.playbackSession = 0;

    // Start initialization process (the promise never rejects, see _runInitialization)
    this.initializeKokoro();
  }

  /**
   * Initialize Kokoro TTS by waiting for the class and then instantiating.
   * The work is started once; later calls return the same promise.
   * @returns {Promise<boolean>} Resolves to true when the instance is ready, false otherwise
   */
  async initializeKokoro() {
    if (!this.initializationPromise) {
      this.initializationPromise = this._runInitialization();
    }
    return this.initializationPromise;
  }

  /**
   * Run the one-time initialization sequence. Never rejects: every failure is
   * logged and reported as `false`.
   * @returns {Promise<boolean>}
   * @private
   */
  async _runInitialization() {
    const classLoadTimeoutMs = 15000;

    try {
      if (typeof window === 'undefined') {
        console.warn('No window object available, Kokoro TTS cannot be initialized.');
        return false;
      }

      if (typeof window.KokoroTTS === 'undefined') {
        console.log('Kokoro TTS class not found, waiting for it to load...');
        const loaded = await this._waitForKokoroClass(classLoadTimeoutMs);
        if (!loaded) {
          return false;
        }
      } else {
        console.log('KokoroTTS class found directly.');
      }

      return await this._initKokoroInstance();
    } catch (error) {
      console.error('Error during KokoroHandler initialization:', error);
      return false;
    }
  }

  /**
   * Wait for the page to announce that the KokoroTTS class was loaded.
   * Listens for the `kokoro-class-loaded` / `kokoro-class-load-failed` window
   * events and gives up after `timeoutMs`.
   * @param {number} timeoutMs - Maximum time to wait for either event
   * @returns {Promise<boolean>} true if the loaded event fired, false on failure or timeout
   * @private
   */
  _waitForKokoroClass(timeoutMs) {
    return new Promise((resolve) => {
      let timeoutId = null;

      // Single exit point so the timer and both listeners are always released.
      const settle = (loaded) => {
        clearTimeout(timeoutId);
        window.removeEventListener('kokoro-class-loaded', onLoaded);
        window.removeEventListener('kokoro-class-load-failed', onFailed);
        resolve(loaded);
      };

      const onLoaded = () => {
        console.log('KokoroTTS class loaded event received.');
        settle(true);
      };

      const onFailed = () => {
        console.error('KokoroTTS class failed to load.');
        settle(false);
      };

      window.addEventListener('kokoro-class-loaded', onLoaded);
      window.addEventListener('kokoro-class-load-failed', onFailed);

      timeoutId = setTimeout(() => {
        console.error('Timed out waiting for KokoroTTS class load event.');
        settle(false);
      }, timeoutMs);
    });
  }

  /**
   * Internal method to create and initialize the KokoroTTS instance.
   * Only WebGPU is attempted; any other device is reported as failure.
   * @returns {Promise<boolean>} true when `kokoroInstance` is ready
   * @private
   */
  async _initKokoroInstance() {
    if (this.kokoroInstance || this.kokoroReady) return true; // Already initialized

    const modelId = "onnx-community/Kokoro-82M-v1.0-ONNX";
    const dtype = 'fp32'; // Use fp32 for WebGPU as recommended
    const loadTimeoutMs = 55000;
    let timeoutId = null;

    try {
      console.log('Initializing KokoroTTS instance (GPU Only Attempt)...');

      const device = await this.getBestDevice();
      if (device !== 'webgpu') {
        console.warn('WebGPU not available or supported. Kokoro TTS (GPU) cannot be initialized.');
        this.kokoroReady = false;
        return false;
      }

      console.log(`Attempting KokoroTTS init with device: ${device}, dtype: ${dtype}`);
      console.log(`Calling KokoroTTS.from_pretrained('${modelId}', { dtype: '${dtype}', device: '${device}' })...`);

      // Start the load first so a synchronous throw cannot leave a timer behind.
      const modelPromise = window.KokoroTTS.from_pretrained(modelId, { dtype, device });
      const timeoutPromise = new Promise((_, reject) => {
        timeoutId = setTimeout(
          () => reject(new Error(`KokoroTTS.from_pretrained (WebGPU) timed out after ${loadTimeoutMs / 1000} seconds`)),
          loadTimeoutMs
        );
      });

      const instance = await Promise.race([modelPromise, timeoutPromise]);
      console.log('KokoroTTS.from_pretrained call completed.');

      if (!instance) {
        throw new Error('KokoroTTS.from_pretrained returned null or undefined.');
      }

      // AudioContext creation is deferred until toggle() runs in a user gesture.
      this.kokoroInstance = instance;
      this.kokoroReady = true;
      console.log('Kokoro TTS (WebGPU) instance created successfully (AudioContext deferred).');
      return true;
    } catch (error) {
      console.error('Error during KokoroTTS (WebGPU) initialization:', error);
      this.kokoroInstance = null;
      this.kokoroReady = false;
      return false; // Ensure failure is explicitly returned
    } finally {
      // Without this the 55 s timer stays armed after a successful or failed load.
      clearTimeout(timeoutId);
    }
  }

  /**
   * Determine the best device (webgpu or wasm) by probing for a WebGPU adapter.
   * @returns {Promise<string>} 'webgpu' when an adapter was obtained, otherwise 'wasm'
   * @private
   */
  async getBestDevice() {
    const gpu = typeof navigator !== 'undefined' ? navigator.gpu : undefined;

    if (gpu) {
      try {
        // Request an adapter. If this succeeds, WebGPU is likely available.
        const adapter = await gpu.requestAdapter();
        if (adapter) {
          console.log('WebGPU supported, selecting webgpu device.');
          return 'webgpu';
        }
        console.warn('WebGPU adapter request returned null.');
      } catch (e) {
        console.warn('WebGPU adapter request failed:', e);
      }
    }

    console.log('WebGPU not supported or available, cannot use GPU for Kokoro.');
    return 'wasm'; // Return wasm indicating GPU is not the best/available option
  }

  /**
   * List available voices (delegates to the KokoroTTS instance).
   * @returns {Promise<*>} Whatever `list_voices()` returns when it returns a
   *   value; otherwise an array of `{ name, description }` entries. Empty
   *   array when Kokoro is not ready or listing fails.
   */
  async listVoices() {
    if (!this.kokoroReady || !this.kokoroInstance) {
      console.warn('Kokoro not ready, cannot list voices.');
      return [];
    }

    const defaultVoices = [{ name: 'af_heart', description: 'Default American Female' }];

    try {
      const instance = this.kokoroInstance;

      if (typeof instance.list_voices === 'function') {
        const listed = await instance.list_voices();
        if (listed !== undefined && listed !== null) {
          return listed;
        }
        // NOTE(review): kokoro-js appears to only print the table from
        // list_voices() and expose the data via a `voices` map keyed by voice
        // id — verify against the bundled library version.
      } else {
        console.warn('list_voices method not found on KokoroTTS instance. Returning default.');
      }

      const voiceMap = instance.voices;
      if (voiceMap && typeof voiceMap === 'object') {
        const ids = Object.keys(voiceMap);
        if (ids.length > 0) {
          return ids.map((id) => {
            const meta = voiceMap[id];
            const label = meta && typeof meta.name === 'string' ? meta.name : id;
            return { name: id, description: label };
          });
        }
      }

      return defaultVoices;
    } catch (error) {
      console.error('Error listing Kokoro voices:', error);
      return [];
    }
  }

  /**
   * Set the voice to use
   * @param {string} voiceName - Name of the voice (e.g., 'af_heart')
   */
  setVoice(voiceName) {
    this.currentVoice = voiceName;
    console.log(`Kokoro voice set to: ${voiceName}`);
  }

  /**
   * Toggle TTS functionality on/off. Must be called from a user gesture the
   * first time, because it creates the AudioContext.
   * @returns {boolean} New state of TTS (enabled/disabled); false when TTS
   *   could not be enabled
   */
  toggle() {
    // Set user activation flag when toggle is called
    this.hasUserActivation = true;

    if (!this.audioContext) {
      try {
        this.audioContext = new (window.AudioContext || window.webkitAudioContext)();
        console.log('AudioContext created on user activation.');
        // Resume if context starts suspended
        if (this.audioContext.state === 'suspended') {
          this.audioContext.resume().catch(err => console.error('Error resuming initial AudioContext:', err));
        }
      } catch (e) {
        console.error('Failed to create AudioContext:', e);
        // Without an AudioContext nothing can be played, so the handler
        // reports itself as not ready (other code may read this flag).
        this.kokoroReady = false;
        return false;
      }
    }

    if (!this.kokoroReady) {
      console.warn('Kokoro TTS not ready yet');
      return false;
    }

    this.enabled = !this.enabled;
    console.log("Kokoro TTS toggled:", this.enabled ? "ON" : "OFF");

    // Stop any ongoing speech when disabling
    if (!this.enabled && (this.speaking || this.isProcessingQueue)) {
      this.stop();
    }

    return this.enabled;
  }

  /**
   * Set the speech rate/speed. The value is clamped to [0.5, 2.0]; values
   * that are not numbers (NaN) are ignored so the previous speed is kept.
   * @param {number} speed - Speed multiplier (0.5 to 2.0)
   */
  setSpeed(speed) {
    const requested = Number(speed);
    if (Number.isNaN(requested)) {
      console.warn('Ignoring invalid Kokoro speed:', speed);
      return;
    }
    this.currentSpeed = Math.max(0.5, Math.min(2.0, requested));
  }

  /**
   * Process text for better speech synthesis: strips bold/italic markers,
   * reduces markdown links to their label and removes HTML tags.
   * @param {string} text - Text to process
   * @returns {string} - Processed text ("" for empty input)
   */
  processTextForSpeech(text) {
    if (!text) return "";

    return String(text)
      .replace(/\*\*([^*]+)\*\*/g, '$1') // Bold (must run before italic)
      .replace(/\*([^*]+)\*/g, '$1') // Italic
      .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1') // Links -> label only
      .replace(/<[^>]+>/g, ''); // HTML tags
  }

  /**
   * Split text into digestible chunks for better TTS handling. Sentences are
   * grouped until a chunk would exceed `maxChunkLength`; a single sentence
   * longer than the limit stays in one chunk.
   * @param {string} text - Text to split
   * @param {number} [maxChunkLength=500] - Soft upper bound for a chunk's length
   * @returns {string[]} - Array of text chunks
   */
  splitTextIntoChunks(text, maxChunkLength = 500) {
    if (!text) return [];

    // A sentence ends at its terminator(s) OR at the end of the text, so a
    // trailing fragment without punctuation is kept instead of being dropped.
    const sentenceRegex = /[^.!?]+(?:[.!?]+|$)/g;
    const sentences = (text.match(sentenceRegex) || [text])
      .filter((sentence) => sentence.trim() !== ''); // nothing to speak in pure whitespace

    const chunks = [];
    let currentChunk = '';

    for (const sentence of sentences) {
      if (currentChunk.length + sentence.length > maxChunkLength) {
        if (currentChunk) chunks.push(currentChunk);
        currentChunk = sentence;
      } else {
        currentChunk += sentence;
      }
    }

    if (currentChunk) chunks.push(currentChunk);

    return chunks;
  }

  /**
   * Drain the speech queue: generate audio for each chunk with KokoroTTS and
   * play it before moving on to the next one. Only one loop runs at a time,
   * and a loop that was superseded by stop() exits without playing anything
   * further or touching the shared flags.
   * @returns {Promise<void>}
   * @private
   */
  async processQueue() {
    if (!this.audioContext) {
      console.warn('AudioContext not available, cannot process Kokoro queue.');
      this.isProcessingQueue = false;
      this.speaking = false;
      return;
    }
    if (this.isProcessingQueue) return;
    if (this.audioQueue.length === 0) {
      this.speaking = false; // Ensure speaking flag is reset when queue is empty
      return;
    }
    if (!this.kokoroReady || !this.kokoroInstance) return;

    // Claim the queue synchronously (before any await) so callers polling
    // isSpeaking() right after speak() see the correct state.
    const session = this.playbackSession;
    this.isProcessingQueue = true;
    this.speaking = true;

    try {
      // Do not undo an explicit pause(); playback continues on resume().
      if (this.audioContext.state === 'suspended' && !this.paused) {
        await this.audioContext.resume().catch(err => console.error('Error resuming AudioContext for queue:', err));
      }

      while (session === this.playbackSession && this.audioQueue.length > 0) {
        const textChunk = this.audioQueue.shift();
        if (!textChunk) continue;

        console.log(`Kokoro generating chunk (${this.audioQueue.length} remaining):`, textChunk.substring(0, 30) + "...");

        try {
          const audioResult = await this.kokoroInstance.generate(textChunk, {
            voice: this.currentVoice,
            speed: this.currentSpeed,
          });

          // stop() (or a newer speak()) happened while generating: drop the result.
          if (session !== this.playbackSession) {
            console.log('Discarding Kokoro audio generated for stopped speech.');
            break;
          }

          if (!audioResult || !audioResult.audio || !(audioResult.audio instanceof Float32Array) || !audioResult.sampling_rate) {
            console.error('Invalid audio data or sampling rate received from KokoroTTS.generate', audioResult);
            throw new Error('Invalid audio data or sampling rate received from KokoroTTS.generate');
          }

          const rawAudioSamples = audioResult.audio;
          const samplingRate = audioResult.sampling_rate;
          console.log(`Received raw audio samples (${rawAudioSamples.length}), sample rate: ${samplingRate}`);

          await this.playRawAudio(rawAudioSamples, samplingRate);
        } catch (error) {
          // A failing chunk is skipped; the rest of the queue is still spoken.
          console.error("Error generating or playing Kokoro speech:", error);
        }
      }
    } catch (error) {
      console.error("Error in Kokoro processQueue:", error);
    } finally {
      // A superseded loop must not reset flags that now belong to a newer loop.
      if (session === this.playbackSession) {
        this.isProcessingQueue = false;
        this.speaking = false;
        console.log("Kokoro queue finished.");
      }
    }
  }

  /**
   * Play raw Float32Array audio samples (mono) using the Web Audio API.
   * @param {Float32Array} samples - The raw audio samples
   * @param {number} sampleRate - The sample rate of the audio
   * @returns {Promise<void>} Resolves when playback ended, was stopped, or
   *   could not be started
   * @private
   */
  async playRawAudio(samples, sampleRate) {
    if (!this.audioContext) {
      console.error('AudioContext not initialized.');
      return;
    }
    // Leave the context suspended while the user has paused playback.
    if (this.audioContext.state === 'suspended' && !this.paused) {
      await this.audioContext.resume().catch(err => console.error('Error resuming AudioContext for playback:', err));
    }

    let source;
    try {
      // One channel: the samples are treated as mono.
      const audioBuffer = this.audioContext.createBuffer(1, samples.length, sampleRate);
      audioBuffer.copyToChannel(samples, 0);

      source = this.audioContext.createBufferSource();
      source.buffer = audioBuffer;
      source.connect(this.audioContext.destination);
    } catch (error) {
      console.error('Error creating or playing raw audio buffer:', error);
      this.currentAudioSource = null; // Clear source on error
      return;
    }

    // Store the current source to allow stopping
    this.currentAudioSource = source;
    console.log(`Playing audio buffer (${(samples.length / sampleRate).toFixed(2)}s)`);

    return new Promise((resolve) => {
      const releaseSource = () => {
        // Only clear the slot if it still refers to this source.
        if (this.currentAudioSource === source) {
          this.currentAudioSource = null;
        }
      };

      source.onended = () => {
        releaseSource();
        console.log('Audio playback finished.');
        resolve();
      };

      try {
        source.start(0); // Start playback immediately
      } catch (error) {
        // Otherwise currentAudioSource would stay set and isSpeaking() would
        // report true forever.
        console.error('Error creating or playing raw audio buffer:', error);
        releaseSource();
        resolve();
      }
    });
  }

  /**
   * Speak the provided text using KokoroTTS. Replaces any speech in progress.
   * @param {string} text - Text to be spoken
   * @param {function} onEndCallback - Callback when all speech ends (also
   *   called when nothing is spoken)
   */
  speak(text, onEndCallback = null) {
    if (!this.enabled || !text) {
      if (onEndCallback) onEndCallback();
      return;
    }

    // If kokoro is not ready yet, wait for initialization
    if (!this.kokoroReady) {
      console.warn("Kokoro TTS not ready yet, waiting for initialization...");
      this.initializeKokoro().then(success => {
        if (success && this.enabled) {
          this._speakInternal(text, onEndCallback);
        } else {
          console.error("Kokoro failed to initialize, cannot speak.");
          if (onEndCallback) onEndCallback();
        }
      });
      return;
    }

    this._speakInternal(text, onEndCallback);
  }

  /**
   * Internal method to handle speech after initialization checks
   * @param {string} text - Text to be spoken
   * @param {function|null} onEndCallback - Called once this utterance finished
   *   or was stopped/replaced
   * @private
   */
  _speakInternal(text, onEndCallback) {
    // Ensure AudioContext is resumed after user interaction
    if (this.audioContext && this.audioContext.state === 'suspended') {
      this.audioContext.resume().catch(err => console.error('Error resuming AudioContext:', err));
    }

    // Don't attempt to speak without user activation
    if (!this.hasUserActivation) {
      console.warn("Not attempting to speak because there hasn't been user interaction yet");
      if (onEndCallback) onEndCallback();
      return;
    }

    try {
      const processedText = this.processTextForSpeech(text);
      console.log("Kokoro TTS attempting to speak:", processedText.substring(0, 50) + "...");

      // Stop any existing speech (this also starts a new playback session)
      this.stop();
      const session = this.playbackSession;

      this.audioQueue = this.splitTextIntoChunks(processedText);

      if (this.audioQueue.length > 0 && !this.isProcessingQueue) {
        this.processQueue().catch(err => console.error("Error in Kokoro processQueue:", err));
      }

      if (onEndCallback) {
        const checkCompletion = () => {
          // A changed session means this utterance was stopped or replaced;
          // do not wait for the replacement to finish.
          if (session !== this.playbackSession || !this.isSpeaking()) {
            onEndCallback();
          } else {
            setTimeout(checkCompletion, 150); // Check again shortly
          }
        };
        // Start checking slightly after processing begins
        setTimeout(checkCompletion, 100);
      }
    } catch (error) {
      console.error("Error in Kokoro speak:", error);
      if (onEndCallback) onEndCallback();
    }
  }

  /**
   * Pause the current speech by suspending the AudioContext.
   */
  pause() {
    if (!this.speaking || !this.audioContext) return;
    this.audioContext.suspend().catch(err => console.error('Error suspending AudioContext:', err));
    this.paused = true;
    console.log('Kokoro audio paused (via AudioContext suspend)');
  }

  /**
   * Resume paused speech
   */
  resume() {
    if (!this.paused || !this.audioContext) return;
    this.audioContext.resume().catch(err => console.error('Error resuming AudioContext:', err));
    this.paused = false;
    console.log('Kokoro audio resumed (via AudioContext resume)');
  }

  /**
   * Stop the current speech, clear the queue and invalidate any queue loop
   * that is still generating audio.
   */
  stop() {
    // Any in-flight processQueue loop sees the new value and bails out.
    this.playbackSession += 1;

    const source = this.currentAudioSource;
    this.currentAudioSource = null;
    if (source) {
      try {
        source.stop();
      } catch (e) {
        // Ignore errors if source already stopped
      }
    }

    const wasPaused = this.paused;

    // Clear the queue and reset flags
    this.audioQueue = [];
    this.isProcessingQueue = false;
    this.speaking = false;
    this.paused = false;

    // pause() suspended the context; bring it back so it is not left
    // suspended while `paused` reads false.
    if (wasPaused && this.audioContext && this.audioContext.state === 'suspended') {
      this.audioContext.resume().catch(err => console.error('Error resuming AudioContext:', err));
    }

    console.log('Kokoro speech stopped and queue cleared.');
  }

  /**
   * Check if TTS is currently active/enabled
   * @returns {boolean}
   */
  isEnabled() {
    return this.enabled && this.kokoroReady;
  }

  /**
   * Check if speech is currently in progress
   * @returns {boolean}
   */
  isSpeaking() {
    // Consider both the processing flag and if an audio source is active
    return this.speaking || this.isProcessingQueue || !!this.currentAudioSource;
  }
}

// Don't create a global instance here - the factory will do this
// const ttsHandler = new KokoroHandler();