// ai.interactive.fiction/public/js/kokoro-handler.js

/**
 * Kokoro Text-to-Speech Handler for AI Interactive Fiction
 * Uses the kokoro-js library for high-quality TTS
 */

/**
 * Text-to-speech handler backed by a KokoroTTS instance (expected on
 * `window.KokoroTTS`) and the Web Audio API.
 *
 * Lifecycle: the constructor starts loading the model; `toggle()` (called from
 * a user gesture) creates the AudioContext and enables speech; `speak()` splits
 * text into chunks which are generated and played one after another.
 */
class KokoroHandler {
  constructor() {
    this.enabled = false;
    this.speaking = false;
    this.paused = false;
    this.audio = null;
    this.currentSpeed = 1.0; // Forwarded to KokoroTTS.generate() as the `speed` option
    this.audioQueue = [];
    this.isProcessingQueue = false;
    this.kokoroReady = false;
    this.kokoroInstance = null; // Store the KokoroTTS instance
    this.hasUserActivation = false;
    this.initializationPromise = null;
    this.audioContext = null; // For playing the generated audio
    this.currentVoice = "af_heart"; // Default voice from README
    this.currentAudioSource = null; // To keep track of the playing audio source
    // Incremented by stop(); lets an in-flight queue run detect it was cancelled.
    this.queueGeneration = 0;

    // Start initialization process
    this.initializeKokoro();
  }

  /**
   * Initialize Kokoro TTS by waiting for the class and then instantiating.
   * The work is started once; later calls return the same promise.
   * @returns {Promise<boolean>} Resolves true when the instance is ready,
   *   false on any failure (never rejects).
   */
  async initializeKokoro() {
    if (!this.initializationPromise) {
      this.initializationPromise = (async () => {
        try {
          const classAvailable = await this._waitForKokoroClass();
          return classAvailable ? await this._initKokoroInstance() : false;
        } catch (error) {
          console.error('Error during KokoroHandler initialization:', error);
          return false;
        }
      })();
    }

    return this.initializationPromise;
  }

  /**
   * Wait until `window.KokoroTTS` is available.
   * Listens for the 'kokoro-class-loaded' / 'kokoro-class-load-failed' window
   * events and gives up after `timeoutMs`.
   * @param {number} [timeoutMs=15000] - How long to wait for a load event
   * @returns {Promise<boolean>} true if the class is available, false otherwise
   * @private
   */
  _waitForKokoroClass(timeoutMs = 15000) {
    return new Promise((resolve) => {
      if (typeof window.KokoroTTS !== 'undefined') {
        console.log('KokoroTTS class found directly.');
        resolve(true);
        return;
      }

      console.log('Kokoro TTS class not found, waiting for it to load...');

      let timeoutId = null;

      // Whichever outcome happens first must detach the other two.
      const cleanup = () => {
        clearTimeout(timeoutId);
        window.removeEventListener('kokoro-class-loaded', onLoaded);
        window.removeEventListener('kokoro-class-load-failed', onFailed);
      };

      const onLoaded = () => {
        cleanup();
        console.log('KokoroTTS class loaded event received.');
        resolve(true);
      };

      const onFailed = () => {
        cleanup();
        console.error('KokoroTTS class failed to load.');
        resolve(false);
      };

      window.addEventListener('kokoro-class-loaded', onLoaded);
      window.addEventListener('kokoro-class-load-failed', onFailed);

      timeoutId = setTimeout(() => {
        cleanup();
        console.error('Timed out waiting for KokoroTTS class load event.');
        resolve(false);
      }, timeoutMs);
    });
  }

  /**
   * Internal method to create and initialize the KokoroTTS instance.
   * Only WebGPU is attempted; any other device is treated as a failure.
   * @returns {Promise<boolean>} true when the instance is ready, false otherwise
   * @private
   */
  async _initKokoroInstance() {
    if (this.kokoroInstance || this.kokoroReady) return true; // Already initialized

    try {
      console.log('Initializing KokoroTTS instance (GPU Only Attempt)...');
      const model_id = "onnx-community/Kokoro-82M-v1.0-ONNX";

      const device = await this.getBestDevice();
      if (device !== 'webgpu') {
        console.warn('WebGPU not available or supported. Kokoro TTS (GPU) cannot be initialized.');
        this.kokoroReady = false;
        return false;
      }

      // Use fp32 for WebGPU as recommended
      const dtype = 'fp32';
      console.log(`Attempting KokoroTTS init with device: ${device}, dtype: ${dtype}`);
      console.log(`Calling KokoroTTS.from_pretrained('${model_id}', { dtype: '${dtype}', device: '${device}' })...`);

      // Race the model load against a timeout so a stalled load cannot hang
      // initialization forever.
      let timeoutId = null;
      const timeoutPromise = new Promise((_, reject) => {
        timeoutId = setTimeout(
          () => reject(new Error('KokoroTTS.from_pretrained (WebGPU) timed out after 55 seconds')),
          55000
        );
      });

      try {
        this.kokoroInstance = await Promise.race([
          window.KokoroTTS.from_pretrained(model_id, { dtype, device }),
          timeoutPromise,
        ]);
      } finally {
        // Do not leave the timer pending once the race has been decided.
        clearTimeout(timeoutId);
      }

      console.log('KokoroTTS.from_pretrained call completed.');

      if (!this.kokoroInstance) {
        console.error('KokoroTTS.from_pretrained returned a falsy value.');
        throw new Error('KokoroTTS.from_pretrained returned null or undefined.');
      }

      // AudioContext creation is deferred until toggle() runs in a user gesture.
      this.kokoroReady = true;
      console.log('Kokoro TTS (WebGPU) instance created successfully (AudioContext deferred).');
      return true;
    } catch (error) {
      console.error('Error during KokoroTTS (WebGPU) initialization:', error);
      this.kokoroInstance = null;
      this.kokoroReady = false;
      return false;
    }
  }

  /**
   * Determine the best device (webgpu or wasm) by probing for a WebGPU adapter.
   * @returns {Promise<string>} 'webgpu' when an adapter was obtained, else 'wasm'
   */
  async getBestDevice() {
    if (typeof navigator !== 'undefined' && navigator.gpu) {
      try {
        // Request an adapter. If this succeeds, WebGPU is likely available.
        const adapter = await navigator.gpu.requestAdapter();
        if (adapter) {
          console.log('WebGPU supported, selecting webgpu device.');
          return 'webgpu';
        }
        console.warn('WebGPU adapter request returned null.');
      } catch (e) {
        console.warn('WebGPU adapter request failed:', e);
      }
    }
    console.log('WebGPU not supported or available, cannot use GPU for Kokoro.');
    return 'wasm'; // Return wasm indicating GPU is not the best/available option
  }

  /**
   * List available voices (delegates to the KokoroTTS instance).
   * @returns {Promise<*>} Whatever `list_voices()` returns; a one-entry default
   *   list when the method is missing; an empty array when not ready or on error.
   */
  async listVoices() {
    if (!this.kokoroReady || !this.kokoroInstance) {
      console.warn('Kokoro not ready, cannot list voices.');
      return [];
    }
    try {
      if (typeof this.kokoroInstance.list_voices === 'function') {
        return await this.kokoroInstance.list_voices();
      }
      console.warn('list_voices method not found on KokoroTTS instance. Returning default.');
      // Fallback based on README examples
      return [{ name: 'af_heart', description: 'Default American Female' }];
    } catch (error) {
      console.error('Error listing Kokoro voices:', error);
      return [];
    }
  }

  /**
   * Set the voice to use
   * @param {string} voiceName - Name of the voice (e.g., 'af_heart')
   */
  setVoice(voiceName) {
    this.currentVoice = voiceName;
    console.log(`Kokoro voice set to: ${voiceName}`);
  }

  /**
   * Toggle TTS functionality on/off. Also records user activation and lazily
   * creates the AudioContext on the first call.
   * @returns {boolean} New state of TTS (enabled/disabled); false when the
   *   AudioContext cannot be created or Kokoro is not ready.
   */
  toggle() {
    // Set user activation flag when toggle is called
    this.hasUserActivation = true;

    if (!this.audioContext) {
      try {
        this.audioContext = new (window.AudioContext || window.webkitAudioContext)();
        console.log('AudioContext created on user activation.');
        // Resume if context starts suspended
        if (this.audioContext.state === 'suspended') {
          this.audioContext.resume().catch(err => console.error('Error resuming initial AudioContext:', err));
        }
      } catch (e) {
        console.error('Failed to create AudioContext:', e);
        // If AudioContext fails, Kokoro cannot play audio
        this.kokoroReady = false;
        return false;
      }
    }

    if (!this.kokoroReady) {
      console.warn('Kokoro TTS not ready yet');
      return false;
    }

    this.enabled = !this.enabled;
    console.log("Kokoro TTS toggled:", this.enabled ? "ON" : "OFF");

    // Stop any ongoing speech when disabling
    if (!this.enabled && (this.speaking || this.isProcessingQueue)) {
      this.stop();
    }

    return this.enabled;
  }

  /**
   * Set the speech rate/speed. The value is clamped to [0.5, 2.0]; values that
   * are not finite numbers are ignored and the current speed is kept.
   * @param {number} speed - Speed multiplier (0.5 to 2.0)
   */
  setSpeed(speed) {
    const requested = Number(speed);
    if (!Number.isFinite(requested)) {
      console.warn('Ignoring invalid Kokoro speed:', speed);
      return;
    }
    this.currentSpeed = Math.max(0.5, Math.min(2.0, requested));
  }

  /**
   * Process text for better speech synthesis
   * @param {string} text - Text to process
   * @returns {string} - Text with bold/italic/link markdown and HTML tags removed
   */
  processTextForSpeech(text) {
    if (!text) return "";

    return text
      .replace(/\*\*([^*]+)\*\*/g, '$1') // Bold
      .replace(/\*([^*]+)\*/g, '$1') // Italic
      .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1') // Links: keep the label only
      .replace(/<[^>]+>/g, ''); // HTML tags
  }

  /**
   * Split text into digestible chunks for better TTS handling.
   * Sentences are kept whole and grouped until a chunk would exceed
   * `maxChunkLength`; a single sentence longer than the limit becomes its own
   * chunk. Trailing text without a sentence terminator is kept.
   * @param {string} text - Text to split
   * @param {number} [maxChunkLength=500] - Soft upper bound on chunk length
   * @returns {string[]} - Array of non-blank text chunks
   */
  splitTextIntoChunks(text, maxChunkLength = 500) {
    if (!text) return [];

    // First alternative: a sentence with its terminator(s).
    // Second alternative: whatever is left at the end without a terminator,
    // which would otherwise be dropped and never spoken.
    const sentenceRegex = /[^.!?]+[.!?]+|[^.!?]+$/g;
    const sentences = text.match(sentenceRegex) || [text];

    const chunks = [];
    let currentChunk = '';

    for (const sentence of sentences) {
      // If adding this sentence would make the chunk too long, start a new chunk
      if (currentChunk.length + sentence.length > maxChunkLength) {
        if (currentChunk) chunks.push(currentChunk);
        currentChunk = sentence;
      } else {
        currentChunk += sentence;
      }
    }

    if (currentChunk) chunks.push(currentChunk);

    // Whitespace-only chunks carry nothing to speak.
    return chunks.filter(chunk => chunk.trim() !== '');
  }

  /**
   * Process the speech queue using KokoroTTS: generate and play each queued
   * chunk in order. A run is cancelled by stop(), which bumps
   * `queueGeneration`; a cancelled run neither plays audio that was still being
   * generated nor touches the state flags, which by then belong to stop() or
   * to a newer run.
   * @returns {Promise<void>} Resolves when this run has finished or was cancelled
   * @private
   */
  async processQueue() {
    // Ensure AudioContext is ready before processing
    if (!this.audioContext) {
      console.warn('AudioContext not available, cannot process Kokoro queue.');
      this.isProcessingQueue = false;
      this.speaking = false;
      return;
    }
    // Ensure AudioContext is running
    if (this.audioContext.state === 'suspended') {
      await this.audioContext.resume().catch(err => console.error('Error resuming AudioContext for queue:', err));
    }

    if (this.isProcessingQueue || this.audioQueue.length === 0 || !this.kokoroReady || !this.kokoroInstance) {
      if (this.audioQueue.length === 0 && !this.isProcessingQueue) {
        this.speaking = false; // Nothing queued and nothing running
      }
      return;
    }

    const generation = this.queueGeneration;
    const isCurrent = () => this.queueGeneration === generation;

    this.isProcessingQueue = true;
    this.speaking = true;

    try {
      while (isCurrent() && this.audioQueue.length > 0) {
        const textChunk = this.audioQueue.shift();
        if (!textChunk) continue;

        console.log(`Kokoro generating chunk (${this.audioQueue.length} remaining):`, textChunk.substring(0, 30) + "...");

        try {
          const audioResult = await this.kokoroInstance.generate(textChunk, {
            voice: this.currentVoice,
            speed: this.currentSpeed,
          });

          // stop() (possibly followed by a new speak()) happened while the
          // model was generating: discard the stale audio.
          if (!isCurrent()) break;

          if (!audioResult || !audioResult.audio || !(audioResult.audio instanceof Float32Array) || !audioResult.sampling_rate) {
            console.error('Invalid audio data or sampling rate received from KokoroTTS.generate', audioResult);
            throw new Error('Invalid audio data or sampling rate received from KokoroTTS.generate');
          }

          const rawAudioSamples = audioResult.audio;
          const samplingRate = audioResult.sampling_rate;
          console.log(`Received raw audio samples (${rawAudioSamples.length}), sample rate: ${samplingRate}`);

          await this.playRawAudio(rawAudioSamples, samplingRate);
        } catch (error) {
          // A failed chunk is skipped; the remaining chunks are still attempted.
          console.error("Error generating or playing Kokoro speech:", error);
        }
      }
    } finally {
      if (isCurrent()) {
        this.isProcessingQueue = false;
        this.speaking = false;
        console.log("Kokoro queue finished.");
      }
    }
  }

  /**
   * Play raw Float32Array audio samples using Web Audio API
   * @param {Float32Array} samples - The raw audio samples (treated as mono)
   * @param {number} sampleRate - The sample rate of the audio
   * @returns {Promise<void>} Resolves when the source's `onended` fires, or
   *   immediately when playback could not be started
   * @private
   */
  async playRawAudio(samples, sampleRate) {
    if (!this.audioContext) {
      console.error('AudioContext not initialized.');
      return;
    }
    // Leave the context suspended when the user paused; resume() will let the
    // scheduled source play.
    if (this.audioContext.state === 'suspended' && !this.paused) {
      await this.audioContext.resume().catch(err => console.error('Error resuming AudioContext for playback:', err));
    }

    try {
      const audioBuffer = this.audioContext.createBuffer(
        1, // Number of channels (assuming mono)
        samples.length,
        sampleRate
      );

      // NOTE: If audio is stereo, this needs adjustment
      audioBuffer.copyToChannel(samples, 0);

      const source = this.audioContext.createBufferSource();
      source.buffer = audioBuffer;
      source.connect(this.audioContext.destination);

      // Store the current source to allow stopping
      this.currentAudioSource = source;

      console.log(`Playing audio buffer (${(samples.length / sampleRate).toFixed(2)}s)`);

      return new Promise((resolve) => {
        source.onended = () => {
          // Only clear the reference if a newer source has not replaced it.
          if (this.currentAudioSource === source) {
            this.currentAudioSource = null;
          }
          console.log('Audio playback finished.');
          resolve();
        };
        source.start(0); // Start playback immediately
      });
    } catch (error) {
      console.error('Error creating or playing raw audio buffer:', error);
      this.currentAudioSource = null; // Clear source on error
    }
  }

  /**
   * Speak the provided text using KokoroTTS. When the model is still loading,
   * speaking is deferred until initialization settles.
   * @param {string} text - Text to be spoken
   * @param {function} onEndCallback - Callback when all speech ends (also
   *   called when nothing is spoken)
   */
  speak(text, onEndCallback = null) {
    if (!this.enabled || !text) {
      if (onEndCallback) onEndCallback();
      return;
    }

    if (!this.kokoroReady) {
      console.warn("Kokoro TTS not ready yet, waiting for initialization...");
      this.initializationPromise.then(success => {
        if (success && this.enabled) {
          this._speakInternal(text, onEndCallback);
        } else {
          console.error("Kokoro failed to initialize, cannot speak.");
          if (onEndCallback) onEndCallback();
        }
      });
      return;
    }

    this._speakInternal(text, onEndCallback);
  }

  /**
   * Internal method to handle speech after initialization checks: cancels any
   * current speech, queues the new chunks and starts processing.
   * @param {string} text - Text to be spoken
   * @param {function} onEndCallback - Called once isSpeaking() reports false
   * @private
   */
  _speakInternal(text, onEndCallback) {
    // Ensure AudioContext is resumed after user interaction
    if (this.audioContext && this.audioContext.state === 'suspended') {
      this.audioContext.resume().catch(err => console.error('Error resuming AudioContext:', err));
    }

    // Don't attempt to speak without user activation
    if (!this.hasUserActivation) {
      console.warn("Not attempting to speak because there hasn't been user interaction yet");
      if (onEndCallback) onEndCallback();
      return;
    }

    try {
      const processedText = this.processTextForSpeech(text);
      console.log("Kokoro TTS attempting to speak:", processedText.substring(0, 50) + "...");

      // Stop any existing speech (this also cancels an in-flight queue run)
      this.stop();

      this.audioQueue = this.splitTextIntoChunks(processedText);

      if (this.audioQueue.length > 0 && !this.isProcessingQueue) {
        this.processQueue().catch(err => console.error("Error in Kokoro processQueue:", err));
      }

      if (onEndCallback) {
        // Poll until nothing is playing or queued any more.
        const checkCompletion = () => {
          if (!this.isSpeaking()) {
            onEndCallback();
          } else {
            setTimeout(checkCompletion, 150);
          }
        };
        // Start checking slightly after processing begins
        setTimeout(checkCompletion, 100);
      }
    } catch (error) {
      console.error("Error in Kokoro speak:", error);
      if (onEndCallback) onEndCallback();
    }
  }

  /**
   * Pause the current speech by suspending the AudioContext.
   */
  pause() {
    if (!this.speaking || !this.audioContext) return;
    this.audioContext.suspend().catch(err => console.error('Error suspending AudioContext:', err));
    this.paused = true;
    console.log('Kokoro audio paused (via AudioContext suspend)');
  }

  /**
   * Resume paused speech by resuming the AudioContext.
   */
  resume() {
    if (!this.paused || !this.audioContext) return;
    this.audioContext.resume().catch(err => console.error('Error resuming AudioContext:', err));
    this.paused = false;
    console.log('Kokoro audio resumed (via AudioContext resume)');
  }

  /**
   * Stop the current speech, clear the queue and cancel any in-flight queue run.
   */
  stop() {
    // Invalidate any processQueue() run that is currently awaiting the model.
    this.queueGeneration += 1;

    if (this.currentAudioSource) {
      try {
        this.currentAudioSource.stop();
      } catch (e) {
        // Ignore errors if source already stopped
      }
      this.currentAudioSource = null;
    }

    this.audioQueue = [];
    this.isProcessingQueue = false;
    this.speaking = false;
    this.paused = false;
    console.log('Kokoro speech stopped and queue cleared.');
  }

  /**
   * Check if TTS is currently active/enabled
   * @returns {boolean} true when enabled and the Kokoro instance is ready
   */
  isEnabled() {
    return this.enabled && this.kokoroReady;
  }

  /**
   * Check if speech is currently in progress
   * @returns {boolean} true while a queue run is active or a source is playing
   */
  isSpeaking() {
    return this.speaking || this.isProcessingQueue || !!this.currentAudioSource;
  }
}

// Don't create a global instance here - the factory will do this
// const ttsHandler = new KokoroHandler();