feat: pivot to always-active Whisper captioning and trigger word detection

2026-03-02 11:50:07 -05:00
parent de06244862
commit e7ec4d7ab2
2 changed files with 158 additions and 119 deletions
@@ -1,24 +1,51 @@
+import sys
 import os
 import subprocess
-import openwakeword
-from openwakeword.model import Model
-import pyaudio
-import numpy as np
-import speech_recognition as sr
 import time
 import re
+import queue
+import threading
+from unittest.mock import MagicMock
+
+# Comprehensive workaround for missing _lzma in some Python builds.
+# When the real module cannot be imported, MagicMock stand-ins are registered
+# in sys.modules so that any later `import lzma` (presumably issued by the
+# libraries imported below) succeeds instead of raising ImportError.
+# NOTE: the stand-ins cannot compress or decompress anything; calls on them
+# just return further MagicMock objects.
+try:
+    import lzma
+except ImportError:
+    mock_lzma = MagicMock()
+    # Plain ints, so code that compares or passes these constants gets real
+    # numbers rather than auto-generated MagicMock attributes.
+    mock_lzma.FORMAT_XZ = 1
+    mock_lzma.FORMAT_ALONE = 2
+    mock_lzma.FORMAT_RAW = 3
+    mock_lzma.CHECK_NONE = 0
+    mock_lzma.CHECK_CRC32 = 1
+    mock_lzma.CHECK_CRC64 = 4
+    mock_lzma.CHECK_SHA256 = 10
+    # Register both the C extension name and the public module name.
+    sys.modules["_lzma"] = MagicMock()
+    sys.modules["lzma"] = mock_lzma
+
+import numpy as np
+import sounddevice as sd
+import torch
+import mlx_whisper
+from silero_vad import load_silero_vad, get_speech_timestamps
 from gtts import gTTS
 import pygame
 import io
+import sys
+
+# --- Configuration ---
+# Word searched for (case-insensitive, whole word) in every transcript.
+TRIGGER_WORD = "Jarvis"
+# Model identifier handed to mlx_whisper.transcribe as `path_or_hf_repo`.
+WHISPER_MODEL = "mlx-community/whisper-small-mlx" 
+# Sample rate (Hz) used for both the input stream and the VAD.
+SAMPLERATE = 16000
+# `blocksize` passed to sd.InputStream.
+BLOCK_SIZE = 512
+# `threshold` passed to get_speech_timestamps.
+VAD_THRESHOLD = 0.5
+# Trailing silence (ms) that ends an utterance; also the VAD's
+# `min_silence_duration_ms`.
+SILENCE_DURATION_MS = 1000
+# The audio buffer is discarded once it grows beyond this many seconds.
+MAX_BUFFER_SECONDS = 20
+CONTEXT_CHARS = 500  # How much previous text to keep for context

-# Configuration
-WAKE_WORD = "hey_jarvis"
-SENSITIVITY = 0.5
-SYSTEM_SOUND = "/System/Library/Sounds/Tink.aiff"
-FOLLOW_UP_SOUND = "/System/Library/Sounds/Submarine.aiff"
-USE_GTTS = False
 WORKSPACE_DIR = "workspace"
 SOUL_PATH = "soul.md"
+SYSTEM_SOUND = "/System/Library/Sounds/Tink.aiff"
+FOLLOW_UP_SOUND = "/System/Library/Sounds/Submarine.aiff"

 # Ensure workspace exists
 if not os.path.exists(WORKSPACE_DIR):
@@ -27,89 +54,66 @@ if not os.path.exists(WORKSPACE_DIR):
 # Initialize pygame mixer for audio playback
 pygame.mixer.init()

+# Global state
+audio_queue = queue.Queue()
+rolling_context = ""
+current_session_id = None
+
 def play_sound(sound_path=SYSTEM_SOUND):
    """Play a system sound asynchronously."""
    subprocess.Popen(["afplay", sound_path])

-# Global session tracker
-current_session_id = None
-
 def get_latest_session_id():
    """Retrieve the UUID of the most recent Gemini session."""
    try:
-        # Check sessions from the workspace context
        result = subprocess.run(["gemini", "--list-sessions"], capture_output=True, text=True, cwd=WORKSPACE_DIR)
-        # Find all UUIDs inside brackets [UUID]
        matches = re.findall(r"\[([a-f0-9\-]+)\]", result.stdout)
        if matches:
-            # Return the last one in the list
            return matches[-1]
    except Exception as e:
        print(f"Error fetching session ID: {e}")
    return None

 def speak_text(text):
-    """Speak text using the 'say' command (default) or gTTS if configured."""
+    """Speak text using the 'say' command.
+
+    The characters ``*``, ``#`` and the backtick are removed before
+    speaking, and the cleaned text is also printed. Does nothing when
+    ``text`` is empty or whitespace-only. Blocks until the ``say``
+    subprocess has finished.
+    """
    if not text or text.strip() == "":
        return
    
    clean_text = text.replace("*", "").replace("#", "").replace("`", "")
-    
-    if USE_GTTS:
-        try:
-            tts = gTTS(text=clean_text, lang='en')
-            fp = io.BytesIO()
-            tts.write_to_fp(fp)
-            fp.seek(0)
-            pygame.mixer.music.load(fp)
-            pygame.mixer.music.play()
-            while pygame.mixer.music.get_busy():
-                pygame.time.Clock().tick(10)
-            return
-        except Exception as e:
-            print(f"Error in gTTS: {e}. Falling back to 'say'.")
-
+    print(f"[Jarvis] Speaking: {clean_text}")
    subprocess.run(["say", clean_text])

-def run_gemini(command, is_init=False):
+def run_gemini(command, context="", is_init=False):
    """Call the gemini CLI, capture output, and speak it."""
    global current_session_id
    
-    args = ["gemini", "--prompt", command, "--yolo"]
+    # Combine context and command if provided
+    full_prompt = command
+    if context:
+        full_prompt = f"Recent Context: {context}\n\nUser Command: {command}"
+    
+    args = ["gemini", "--prompt", full_prompt, "--yolo"]
    
    if current_session_id:
        args.extend(["--resume", current_session_id])
    
-    # Set up environment for Gemini CLI
    env = os.environ.copy()
    
    if os.path.exists(SOUL_PATH):
        with open(SOUL_PATH, 'r') as f:
            soul_content = f.read()
        
-        # Inject date/time context only on initialization
        if is_init:
            current_time = time.strftime("%A, %B %d, %Y, %I:%M %p")
            soul_content = f"Temporal Context: The current date and time is {current_time}.\n\n" + soul_content
-            print(f"\n[Jarvis] Initializing system protocol with temporal context...")
-        else:
-            print(f"\n[Jarvis] Communicating with Gemini...")
+            print(f"\n[Jarvis] Initializing system protocol...")
        
-        # Use a temporary file for the system instruction
        system_md_path = os.path.abspath(os.path.join(WORKSPACE_DIR, ".system_prompt.md"))
        with open(system_md_path, 'w') as f:
            f.write(soul_content)
        env["GEMINI_SYSTEM_MD"] = system_md_path
-    else:
-        if is_init:
-            print(f"\n[Jarvis] Initializing system protocol...")
-        else:
-            print(f"\n[Jarvis] Communicating with Gemini...")
-
-    print(f"[Jarvis] Executing: {' '.join(args)} in {WORKSPACE_DIR}")

    try:
-        # All Gemini commands run inside the workspace directory
        process = subprocess.run(args, capture_output=True, text=True, cwd=WORKSPACE_DIR, env=env)
        response = process.stdout.strip()
        
@@ -126,76 +130,108 @@ def run_gemini(command, is_init=False):
    except Exception as e:
        print(f"Error running gemini: {e}")

-# --- Startup Sequence ---
+def audio_callback(indata, frames, time, status):
+    """Input-stream callback: hand each captured audio block to the main loop.
+
+    Args:
+        indata: Captured audio block; a copy of it is queued.
+        frames: Unused.
+        time: Unused. NOTE: this parameter shadows the module-level ``time``
+            import, but only inside this function.
+        status: Printed to stderr when truthy.
+    """
+    if status:
+        print(status, file=sys.stderr)
+    # Queue a copy rather than `indata` itself -- presumably because the
+    # stream may reuse that buffer once the callback returns (confirm against
+    # the sounddevice callback documentation).
+    audio_queue.put(indata.copy())

-model = Model(wakeword_models=[WAKE_WORD], inference_framework="onnx")
+def main():
+    """Run the always-on listen -> VAD -> transcribe -> trigger-word loop.
+
+    Loads the Silero VAD model, sends the initialization prompt to Gemini,
+    then consumes microphone blocks from ``audio_queue`` until interrupted.
+    Once the VAD reports speech followed by more than SILENCE_DURATION_MS of
+    silence, the buffered audio is transcribed with Whisper. A transcript
+    containing TRIGGER_WORD is forwarded to ``run_gemini`` together with the
+    rolling transcript context; every transcript is appended to that context.
+
+    Side effects: updates the module-level ``rolling_context``.
+    Returns None after Ctrl+C, or after an unexpected exception has been
+    printed.
+    """
+    global rolling_context
    
-CHUNK = 1280
-FORMAT = pyaudio.paInt16
-CHANNELS = 1
-RATE = 16000
+    print("[Jarvis] Loading models...")
+    # Reported for information only; `device` is not passed to any model below.
+    device = "mps" if torch.backends.mps.is_available() else "cpu"
+    print(f"[Jarvis] Using device: {device}")
    
-audio = pyaudio.PyAudio()
-stream = audio.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)
-
-recognizer = sr.Recognizer()
-recognizer.pause_threshold = 1.2
-recognizer.non_speaking_duration = 0.5
-mic = sr.Microphone()
-
-print("[Jarvis] Calibrating for ambient noise...")
-with mic as source:
-    recognizer.adjust_for_ambient_noise(source, duration=1)
+    vad_model = load_silero_vad()
+    print("[Jarvis] Models loaded.")

    print("[Jarvis] Booting system protocols...")
-current_time = time.strftime("%A, %B %d, %Y, %I:%M %p")
-run_gemini(f"System initialization complete. The current date and time is {current_time}. Awaiting orders, Sir.", is_init=True)
+    run_gemini("System initialization complete. Awaiting orders, Sir.", is_init=True)

-print(f"Listening for '{WAKE_WORD}'...")
+    print(f"[Jarvis] Always-active mic enabled. Listening for '{TRIGGER_WORD}'...")
+    
+    # Built once, outside the loop. re.escape keeps the match literal even if
+    # TRIGGER_WORD is changed to something containing regex metacharacters.
+    trigger_pattern = re.compile(rf"\b{re.escape(TRIGGER_WORD)}\b", re.IGNORECASE)
+    # Trailing silence, in samples, that marks the end of an utterance.
+    silence_samples = SAMPLERATE * SILENCE_DURATION_MS / 1000
+    audio_buffer = []
+    speech_started = False
    
    try:
+        with sd.InputStream(samplerate=SAMPLERATE, channels=1, callback=audio_callback, blocksize=BLOCK_SIZE):
            while True:
-        data = stream.read(CHUNK, exception_on_overflow=False)
-        audio_frame = np.frombuffer(data, dtype=np.int16)
-        prediction = model.predict(audio_frame)
+                # Sleep on the queue (up to 100 ms) for the next block instead
+                # of spinning, then take whatever else is already waiting.
+                new_chunks = []
+                try:
+                    new_chunks.append(audio_queue.get(timeout=0.1))
+                    while True:
+                        new_chunks.append(audio_queue.get_nowait())
+                except queue.Empty:
+                    pass
+                audio_buffer.extend(chunk.flatten() for chunk in new_chunks)

-        if prediction[WAKE_WORD] > SENSITIVITY:
-            print(f"\n[Jarvis] Wake word detected! (Score: {prediction[WAKE_WORD]:.2f})")
-            stream.stop_stream()
+                # Without new audio the buffer is unchanged, so re-running the
+                # VAD could not produce a different answer.
+                if new_chunks:
+                    current_audio = np.concatenate(audio_buffer)
+                    buffer_duration = len(current_audio) / SAMPLERATE
                    
-            in_conversation = True
-            first_listening = True
+                    # Watchdog to prevent buffer bloat. Checked before the VAD
+                    # so an oversized buffer is dropped without analysing it.
+                    if buffer_duration > MAX_BUFFER_SECONDS:
+                        print("[Jarvis] Buffer limit reached. Resetting...")
+                        audio_buffer = []
+                        speech_started = False
+                        continue

-            while in_conversation:
-                play_sound(SYSTEM_SOUND if first_listening else FOLLOW_UP_SOUND)
-                print("[Jarvis] Listening...")
+                    speech_timestamps = get_speech_timestamps(
+                        torch.from_numpy(current_audio),
+                        vad_model,
+                        sampling_rate=SAMPLERATE,
+                        threshold=VAD_THRESHOLD,
+                        min_silence_duration_ms=SILENCE_DURATION_MS
+                    )

-                with mic as source:
-                    try:
-                        audio_cmd = recognizer.listen(source, timeout=10, phrase_time_limit=15)
+                    if len(speech_timestamps) > 0:
+                        speech_started = True
+                        last_end = speech_timestamps[-1]['end']
+                        
+                        # Check if speech has ended (silence after last speech)
+                        if (len(current_audio) - last_end) > silence_samples:
+                            
+                            # Transcribe
                            print("[Jarvis] Transcribing...")
-                        command = recognizer.recognize_google(audio_cmd)
-                        print(f"[Jarvis] You said: {command}")
+                            result = mlx_whisper.transcribe(current_audio, path_or_hf_repo=WHISPER_MODEL)
+                            text = result['text'].strip()
                            
-                        run_gemini(command)
-                        first_listening = False
+                            if text:
+                                print(f"[Caption]: {text}")
                                
-                    except sr.WaitTimeoutError:
-                        print("[Jarvis] Session timed out.")
-                        in_conversation = False
-                    except sr.UnknownValueError:
-                        print("[Jarvis] No speech detected. Ending session.")
-                        in_conversation = False
-                    except sr.RequestError as e:
-                        print(f"[Jarvis] Speech service error: {e}")
-                        in_conversation = False
+                                # Detect Trigger Word
+                                trigger_match = trigger_pattern.search(text)
+                                if trigger_match:
+                                    print(f"[Jarvis] Trigger word detected!")
+                                    play_sound(SYSTEM_SOUND)
                                    
-            stream.start_stream()
-            print(f"\nListening for '{WAKE_WORD}'...")
+                                    # Extract command (text after trigger word)
+                                    command = text[trigger_match.end():].strip()
+                                    
+                                    if not command:
+                                        print("[Jarvis] No command following trigger word. Using full text.")
+                                        command = text
+                                    
+                                    # Call Gemini with context
+                                    run_gemini(command, context=rolling_context)
+                                    
+                                    # The stream kept queueing audio while the
+                                    # call above blocked (and, per its
+                                    # docstring, spoke the reply). Drop that
+                                    # backlog so Jarvis does not caption or
+                                    # trigger on its own voice.
+                                    while not audio_queue.empty():
+                                        audio_queue.get_nowait()
+                                
+                                # Every transcript joins the rolling context.
+                                # This happens after the Gemini call so the
+                                # context sent with a command never contains
+                                # the command itself.
+                                rolling_context = (rolling_context + " " + text)[-CONTEXT_CHARS:].strip()
+
+                            # Reset buffer after processing
+                            audio_buffer = []
+                            speech_started = False
+                    
+                    elif not speech_started and len(current_audio) > SAMPLERATE * 2:
+                        # Clear buffer if no speech detected for 2 seconds
+                        audio_buffer = []

    except KeyboardInterrupt:
-    print("\nStopping...")
-finally:
-    stream.stop_stream()
-    stream.close()
-    audio.terminate()
+        print("\n[Jarvis] Shutting down...")
+    except Exception as e:
+        print(f"\n[Jarvis] Fatal Error: {e}")
+
+if __name__ == "__main__":
+    main()
@@ -1,6 +1,9 @@
-openwakeword
-pyaudio
-requests
-SpeechRecognition
+mlx-whisper
+sounddevice
+torch
+silero-vad
+transformers
 numpy
-onnxruntime
+requests
+pygame
+gTTS